Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH v11 net-next 04/12] bpf: expand BPF syscall with program load/unload

260 views
Skip to first unread message

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
eBPF programs are similar to kernel modules. They are loaded by the user
process and automatically unloaded when process exits. Each eBPF program is
a safe run-to-completion set of instructions. eBPF verifier statically
determines that the program terminates and is safe to execute.

The following syscall wrapper can be used to load the program:
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = insns,
.insn_cnt = insn_cnt,
.license = license,
};

return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
where 'insns' is an array of eBPF instructions and 'license' is a string
that must be GPL compatible to call helper functions marked gpl_only

Upon succesful load the syscall returns prog_fd.
Use close(prog_fd) to unload the program.

User space tests and examples follow in the later patches

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 36 ++++++++++
include/linux/filter.h | 7 +-
include/uapi/linux/bpf.h | 27 ++++++++
kernel/bpf/syscall.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++
net/core/filter.c | 2 +
5 files changed, 242 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2887f3f9da59..8ea6f9923ff2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,4 +46,40 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
struct bpf_map *bpf_map_get(struct fd f);

+/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
+ * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
+ * instructions after verifying
+ */
+struct bpf_func_proto {
+ u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+ bool gpl_only;
+};
+
+struct bpf_verifier_ops {
+ /* return eBPF function prototype for verification */
+ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+};
+
+struct bpf_prog_type_list {
+ struct list_head list_node;
+ struct bpf_verifier_ops *ops;
+ enum bpf_prog_type type;
+};
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
+struct bpf_prog_info {
+ atomic_t refcnt;
+ bool is_gpl_compatible;
+ enum bpf_prog_type prog_type;
+ struct bpf_verifier_ops *ops;
+ struct bpf_map **used_maps;
+ u32 used_map_cnt;
+};
+
+struct bpf_prog;
+
+void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
+
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4b59edead908..9727616693e5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -15,6 +15,7 @@
struct sk_buff;
struct sock;
struct seccomp_data;
+struct bpf_prog_info;

/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -302,8 +303,12 @@ struct bpf_work_struct {
struct bpf_prog {
u16 pages; /* Number of allocated pages */
bool jited; /* Is our filter JIT'ed? */
+ bool has_info; /* whether 'info' is valid */
u32 len; /* Number of filter blocks */
- struct sock_fprog_kern *orig_prog; /* Original BPF program */
+ union {
+ struct sock_fprog_kern *orig_prog; /* Original BPF program */
+ struct bpf_prog_info *info;
+ };
struct bpf_work_struct *work; /* Deferred free work struct */
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3a03fdf4db0e..1d0411965576 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -99,12 +99,23 @@ enum bpf_cmd {
* returns zero and stores next key or negative error
*/
BPF_MAP_GET_NEXT_KEY,
+
+ /* verify and load eBPF program
+ * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
+ * Using attr->prog_type, attr->insns, attr->license
+ * returns fd or negative error
+ */
+ BPF_PROG_LOAD,
};

enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
};

+enum bpf_prog_type {
+ BPF_PROG_TYPE_UNSPEC,
+};
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
enum bpf_map_type map_type;
@@ -126,6 +137,22 @@ union bpf_attr {
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
};
+
+ struct { /* anonymous struct used by BPF_PROG_LOAD command */
+ enum bpf_prog_type prog_type;
+ __u32 insn_cnt;
+ const struct bpf_insn __user *insns;
+ const char __user *license;
+#define BPF_PROG_LOAD_LAST_FIELD license
+ };
+};
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+enum bpf_func_id {
+ BPF_FUNC_unspec,
+ __BPF_FUNC_MAX_ID,
};

#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5fbcfeaf7403..4ad6782ac514 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -14,6 +14,8 @@
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>

static LIST_HEAD(bpf_map_types);

@@ -316,6 +318,172 @@ err_put:
return err;
}

+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->info->ops = tl->ops;
+ prog->info->prog_type = type;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_bpf_prog_info(struct bpf_prog_info *info)
+{
+ int i;
+
+ for (i = 0; i < info->used_map_cnt; i++)
+ bpf_map_put(info->used_maps[i]);
+
+ kfree(info->used_maps);
+ kfree(info);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ BUG_ON(!prog->has_info);
+ if (atomic_dec_and_test(&prog->info->refcnt)) {
+ free_bpf_prog_info(prog->info);
+ bpf_prog_free(prog);
+ }
+}
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->info->refcnt);
+ fdput(f);
+ return prog;
+}
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, attr->license, sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, attr->insns,
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+ prog->has_info = false;
+
+ /* allocate eBPF related auxilary data */
+ err = -ENOMEM;
+ prog->info = kzalloc(sizeof(struct bpf_prog_info), GFP_USER);
+ if (!prog->info)
+ goto free_prog;
+
+ prog->has_info = true;
+ atomic_set(&prog->info->refcnt, 1);
+ prog->info->is_gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog_info;
+
+ /* run eBPF verifier */
+ /* err = bpf_check(prog, tb); */
+
+ if (err < 0)
+ goto free_prog_info;
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_prog_info;
+
+ return err;
+
+free_prog_info:
+ free_bpf_prog_info(prog->info);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr *attr;
@@ -357,6 +525,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(attr);
break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index dfc716ffa44b..d771e4f03745 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -835,6 +835,7 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
{
struct sock_fprog_kern *fprog = fp->orig_prog;

+ BUG_ON(fp->has_info);
if (fprog) {
kfree(fprog->filter);
kfree(fprog);
@@ -973,6 +974,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)

fp->bpf_func = NULL;
fp->jited = false;
+ fp->has_info = false;

err = bpf_check_classic(fp->insns, fp->len);
if (err) {
--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
in native eBPF programs userspace is using pseudo BPF_CALL instructions
which encode one of 'enum bpf_func_id' inside insn->imm field.
Verifier checks that program using correct function arguments to given func_id.
If all checks passed, kernel needs to fixup BPF_CALL->imm fields by
replacing func_id with in-kernel function pointer.
eBPF interpreter just calls the function.

In-kernel eBPF users continue to use generic BPF_CALL.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ad6782ac514..b04873f6cc60 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -339,6 +339,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
list_add(&tl->list_node, &bpf_prog_types);
}

+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when program has bpf_call instructions
+ * and it passed bpf_check(), means that
+ * ops->get_func_proto must have been supplied, check it
+ */
+ BUG_ON(!prog->info->ops->get_func_proto);
+
+ fn = prog->info->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_bpf_prog_info(struct bpf_prog_info *info)
{
@@ -466,6 +500,9 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_prog_info;

+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
/* eBPF program is ready to be JITed */
bpf_prog_select_runtime(prog);

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
add optional attributes for BPF_PROG_LOAD syscall:
struct {
...
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
void __user *log_buf; /* user supplied buffer */
};

In such case the verifier will return its verification log in the user
supplied buffer which can be used by humans to analyze why verifier
rejected given program

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/uapi/linux/bpf.h | 5 +-
kernel/bpf/verifier.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 239 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1d0411965576..693a797e6b3f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,7 +143,10 @@ union bpf_attr {
__u32 insn_cnt;
const struct bpf_insn __user *insns;
const char __user *license;
-#define BPF_PROG_LOAD_LAST_FIELD license
+ __u32 log_level; /* verbosity level of eBPF verifier */
+ __u32 log_size; /* size of user buffer */
+ void __user *log_buf; /* user supplied buffer */
+#define BPF_PROG_LOAD_LAST_FIELD log_buf
};
};

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6f9c3d6b4d7..5c88cf54bb3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,9 +125,244 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static void *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
+ void __user *log_ubuf = NULL;
+ struct verifier_env *env;
int ret = -EINVAL;

+ if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ /* ret = do_check(env); */
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
return ret;

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
done as separate commit to ease conflict resolution

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 3 ++-
include/uapi/asm-generic/unistd.h | 4 +++-
kernel/sys_ni.c | 3 +++
5 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 028b78168d85..9fe1b5d002f0 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -363,3 +363,4 @@
354 i386 seccomp sys_seccomp
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
+357 i386 bpf sys_bpf
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 35dd922727b9..281150b539a2 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -327,6 +327,7 @@
318 common getrandom sys_getrandom
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0f86d85a9ce4..bda9b81357cc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -65,6 +65,7 @@ struct old_linux_dirent;
struct perf_event_attr;
struct file_handle;
struct sigaltstack;
+union bpf_attr;

#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
const char __user *uargs);
asmlinkage long sys_getrandom(char __user *buf, size_t count,
unsigned int flags);
-
+asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 11d11bc5c78f..22749c134117 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp)
__SYSCALL(__NR_getrandom, sys_getrandom)
#define __NR_memfd_create 279
__SYSCALL(__NR_memfd_create, sys_memfd_create)
+#define __NR_bpf 280
+__SYSCALL(__NR_bpf, sys_bpf)

#undef __NR_syscalls
-#define __NR_syscalls 280
+#define __NR_syscalls 281

/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..b4b5083f5f5e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -218,3 +218,6 @@ cond_syscall(sys_kcmp);

/* operate on Secure Computing state */
cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions
to refer to process-local map_fd. Scan the program for such instructions and
if FDs are valid, convert them to 'struct bpf_map' pointers which will be used
by verifier to check access to maps in bpf_map_lookup/update() calls.
If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping
BPF_PSEUDO_MAP_FD flag.

Note that eBPF interpreter is generic and knows nothing about pseudo insns.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/filter.h | 6 ++
kernel/bpf/verifier.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 153 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9727616693e5..c105af7420a8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -139,6 +139,12 @@ struct bpf_prog_info;
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })

+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c88cf54bb3b..df4eb58f7f0a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,10 +125,15 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
/* single container for all structs
* one verifier_env per bpf_check() call
*/
struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
};

/* verbose verifier prints what it's seeing
@@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}

+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
void __user *log_ubuf = NULL;
@@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;

+ env->prog = prog;
+
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);

@@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
log_level = 0;
}

+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

+skip_full_check:
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
goto free_log_buf;
}

+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ prog->info->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!prog->info->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(prog->info->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ prog->info->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }

free_log_buf:
if (log_level)
vfree(log_buf);
free_env:
+ if (!prog->info->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
kfree(env);
mutex_unlock(&bpf_verifier_lock);
return ret;

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
move instruction macros (like BPF_MOV64_REG or BPF_ALU32_IMM)
from linux/filter.h into uapi/linux/bpf.h
so that userspace programs can use them.

verifier testsuite (in later patches) will be using them.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/filter.h | 226 ----------------------------------------------
include/uapi/linux/bpf.h | 226 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 226 insertions(+), 226 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c105af7420a8..15a96857039a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -37,232 +37,6 @@ struct bpf_prog_info;
/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK 512

-/* Helper macros for filter block array initializers. */
-
-/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
-
-#define BPF_ALU64_REG(OP, DST, SRC) \
- ((struct bpf_insn) { \
- .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = 0 })
-
-#define BPF_ALU32_REG(OP, DST, SRC) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = 0 })
-
-/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
-
-#define BPF_ALU64_IMM(OP, DST, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = 0, \
- .imm = IMM })
-
-#define BPF_ALU32_IMM(OP, DST, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = 0, \
- .imm = IMM })
-
-/* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
-
-#define BPF_ENDIAN(TYPE, DST, LEN) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = 0, \
- .imm = LEN })
-
-/* Short form of mov, dst_reg = src_reg */
-
-#define BPF_MOV64_REG(DST, SRC) \
- ((struct bpf_insn) { \
- .code = BPF_ALU64 | BPF_MOV | BPF_X, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = 0 })
-
-#define BPF_MOV32_REG(DST, SRC) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_MOV | BPF_X, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = 0 })
-
-/* Short form of mov, dst_reg = imm32 */
-
-#define BPF_MOV64_IMM(DST, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU64 | BPF_MOV | BPF_K, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = 0, \
- .imm = IMM })
-
-#define BPF_MOV32_IMM(DST, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_MOV | BPF_K, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = 0, \
- .imm = IMM })
-
-/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
-#define BPF_LD_IMM64(DST, IMM) \
- BPF_LD_IMM64_RAW(DST, 0, IMM)
-
-#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_LD | BPF_DW | BPF_IMM, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = (__u32) (IMM) }), \
- ((struct bpf_insn) { \
- .code = 0, /* zero is reserved opcode */ \
- .dst_reg = 0, \
- .src_reg = 0, \
- .off = 0, \
- .imm = ((__u64) (IMM)) >> 32 })
-
-#define BPF_PSEUDO_MAP_FD 1
-
-/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
-#define BPF_LD_MAP_FD(DST, MAP_FD) \
- BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
-
-/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */
-
-#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = IMM })
-
-#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = IMM })
-
-/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
-
-#define BPF_LD_ABS(SIZE, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
- .dst_reg = 0, \
- .src_reg = 0, \
- .off = 0, \
- .imm = IMM })
-
-/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */
-
-#define BPF_LD_IND(SIZE, SRC, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \
- .dst_reg = 0, \
- .src_reg = SRC, \
- .off = 0, \
- .imm = IMM })
-
-/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
-
-#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
- ((struct bpf_insn) { \
- .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = OFF, \
- .imm = 0 })
-
-/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
-
-#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
- ((struct bpf_insn) { \
- .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = OFF, \
- .imm = 0 })
-
-/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
-
-#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
- ((struct bpf_insn) { \
- .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = OFF, \
- .imm = IMM })
-
-/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
-
-#define BPF_JMP_REG(OP, DST, SRC, OFF) \
- ((struct bpf_insn) { \
- .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = OFF, \
- .imm = 0 })
-
-/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
-
-#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
- ((struct bpf_insn) { \
- .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
- .dst_reg = DST, \
- .src_reg = 0, \
- .off = OFF, \
- .imm = IMM })
-
-/* Function call */
-
-#define BPF_EMIT_CALL(FUNC) \
- ((struct bpf_insn) { \
- .code = BPF_JMP | BPF_CALL, \
- .dst_reg = 0, \
- .src_reg = 0, \
- .off = 0, \
- .imm = ((FUNC) - __bpf_call_base) })
-
-/* Raw code statement block */
-
-#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
- ((struct bpf_insn) { \
- .code = CODE, \
- .dst_reg = DST, \
- .src_reg = SRC, \
- .off = OFF, \
- .imm = IMM })
-
-/* Program exit */
-
-#define BPF_EXIT_INSN() \
- ((struct bpf_insn) { \
- .code = BPF_JMP | BPF_EXIT, \
- .dst_reg = 0, \
- .src_reg = 0, \
- .off = 0, \
- .imm = 0 })
-
#define bytes_to_bpf_size(bytes) \
({ \
int bpf_size = -EINVAL; \
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 693a797e6b3f..e55bcba833e5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -54,6 +54,232 @@ enum {
/* BPF has 10 general purpose 64-bit registers and stack frame. */
#define MAX_BPF_REG __MAX_BPF_REG

+/* Helper macros for filter block array initializers. */
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */
+
+#define BPF_ENDIAN(TYPE, DST, LEN) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = LEN })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_MOV32_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 })
+
+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */
+
+#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */
+
+#define BPF_LD_IND(SIZE, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \
+ .dst_reg = 0, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Function call */
+
+#define BPF_EMIT_CALL(FUNC) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_CALL, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((FUNC) - __bpf_call_base) })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* dest register */

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
this patch adds all of eBPF verfier documentation and empty bpf_check()

The end goal for the verifier is to statically check safety of the program.

Verifier will catch:
- loops
- out of range jumps
- unreachable instructions
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention

More details in Documentation/networking/filter.txt

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 224 +++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 2 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 133 +++++++++++++++++++++
5 files changed, 361 insertions(+), 2 deletions(-)
create mode 100644 kernel/bpf/verifier.c

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 1900d29518f1..f1c90967f748 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
32-bit immediate value into a register.

+eBPF verifier
+-------------
+The safety of the eBPF program is determined in two steps.
+
+First step does DAG check to disallow loops and other CFG validation.
+In particular it will detect programs that have unreachable instructions.
+(though classic BPF checker allows them)
+
+Second step starts from the first insn and descends all possible paths.
+It simulates execution of every insn and observes the state change of
+registers and stack.
+
+At the start of the program the register R1 contains a pointer to context
+and has type PTR_TO_CTX.
+If verifier sees an insn that does R2=R1, then R2 has now type
+PTR_TO_CTX as well and can be used on the right hand side of expression.
+If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE,
+since addition of two valid pointers makes invalid pointer.
+(In 'secure' mode verifier will reject any type of pointer arithmetic to make
+sure that kernel addresses don't leak to unprivileged users)
+
+If register was never written to, it's not readable:
+ bpf_mov R0 = R2
+ bpf_exit
+will be rejected, since R2 is unreadable at the start of the program.
+
+After kernel function call, R1-R5 are reset to unreadable and
+R0 has a return type of the function.
+
+Since R6-R9 are callee saved, their state is preserved across the call.
+ bpf_mov R6 = 1
+ bpf_call foo
+ bpf_mov R0 = R6
+ bpf_exit
+is a correct program. If there was R1 instead of R6, it would have
+been rejected.
+
+load/store instructions are allowed only with registers of valid types, which
+are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked.
+For example:
+ bpf_mov R1 = 1
+ bpf_mov R2 = 2
+ bpf_xadd *(u32 *)(R1 + 3) += R2
+ bpf_exit
+will be rejected, since R1 doesn't have a valid pointer type at the time of
+execution of instruction bpf_xadd.
+
+At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context')
+A callback is used to customize verifier to restrict eBPF program access to only
+certain fields within ctx structure with specified size and alignment.
+
+For example, the following insn:
+ bpf_ld R0 = *(u32 *)(R6 + 8)
+intends to load a word from address R6 + 8 and store it into R0
+If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know
+that offset 8 of size 4 bytes can be accessed for reading, otherwise
+the verifier will reject the program.
+If R6=FRAME_PTR, then access should be aligned and be within
+stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8,
+so it will fail verification, since it's out of bounds.
+
+The verifier will allow eBPF program to read data from stack only after
+it wrote into it.
+Classic BPF verifier does similar check with M[0-15] memory slots.
+For example:
+ bpf_ld R0 = *(u32 *)(R10 - 4)
+ bpf_exit
+is invalid program.
+Though R10 is correct read-only register and has type FRAME_PTR
+and R10 - 4 is within stack bounds, there were no stores into that location.
+
+Pointer register spill/fill is tracked as well, since four (R6-R9)
+callee saved registers may not be enough for some programs.
+
+Allowed function calls are customized with bpf_verifier_ops->get_func_proto()
+The eBPF verifier will check that registers match argument constraints.
+After the call register R0 will be set to return type of the function.
+
+Function calls is a main mechanism to extend functionality of eBPF programs.
+Socket filters may let programs to call one set of functions, whereas tracing
+filters may allow completely different set.
+
+If a function made accessible to eBPF program, it needs to be thought through
+from safety point of view. The verifier will guarantee that the function is
+called with valid arguments.
+
+seccomp vs socket filters have different security restrictions for classic BPF.
+Seccomp solves this by two stage verifier: classic BPF verifier is followed
+by seccomp verifier. In case of eBPF one configurable verifier is shared for
+all use cases.
+
+See details of eBPF verifier in kernel/bpf/verifier.c
+
eBPF maps
---------
'maps' is a generic storage of different types for sharing data between kernel
@@ -1040,6 +1133,137 @@ The map is defined by:
. key size in bytes
. value size in bytes

+Understanding eBPF verifier messages
+------------------------------------
+
+The following are few examples of invalid eBPF programs and verifier error
+messages as seen in the log:
+
+Program with unreachable instructions:
+static struct bpf_insn prog[] = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+};
+Error:
+ unreachable insn 1
+
+Program that reads uninitialized register:
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r0 = r2
+ R2 !read_ok
+
+Program that doesn't initialize R0 before exiting:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r1
+ 1: (95) exit
+ R0 !read_ok
+
+Program that accesses stack out of bounds:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 +8) = 0
+ invalid stack off=8 size=8
+
+Program that doesn't initialize stack before passing its address into function:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r10
+ 1: (07) r2 += -8
+ 2: (b7) r1 = 0x0
+ 3: (85) call 1
+ invalid indirect read from stack off -8+0 size 8
+
+Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ fd 0 is not pointing to valid bpf_map
+
+Program that doesn't check return value of map_lookup_elem() before accessing
+map element:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ 5: (7a) *(u64 *)(r0 +0) = 0
+ R0 invalid mem access 'map_value_or_null'
+
+Program that correctly checks map_lookup_elem() returned value for NULL, but
+accesses the memory with incorrect alignment:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+1
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +4) = 0
+ misaligned access off 4 size 8
+
+Program that correctly checks map_lookup_elem() returned value for NULL and
+accesses memory with correct alignment in one side of 'if' branch, but fails
+to do so in the other side of 'if' branch:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+2
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +0) = 0
+ 7: (95) exit
+
+ from 5 to 8: R0=imm0 R10=fp
+ 8: (7a) *(u64 *)(r0 +0) = 1
+ R0 invalid mem access 'imm'
+
Testing
-------

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8ea6f9923ff2..490551e17c15 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -81,5 +81,7 @@ struct bpf_prog;

void bpf_prog_put(struct bpf_prog *prog);
struct bpf_prog *bpf_prog_get(u32 ufd);
+/* verify correctness of eBPF program */
+int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);

#endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e9f7334ed07a..3c726b0995b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o syscall.o
+obj-y := core.o syscall.o verifier.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b04873f6cc60..f0e14866d54e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -495,7 +495,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog_info;

/* run eBPF verifier */
- /* err = bpf_check(prog, tb); */
+ err = bpf_check(prog, attr);

if (err < 0)
goto free_prog_info;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..d6f9c3d6b4d7
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ int ret = -EINVAL;
+
+ return ret;
+}

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:02 AM9/10/14
to
check that control flow graph of eBPF program is a directed acyclic graph

check_cfg() does:
- detect loops
- detect unreachable instructions
- check that program terminates with BPF_EXIT insn
- check that all branches are within program boundary

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/verifier.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 183 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df4eb58f7f0a..f03257de2bc3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -313,6 +313,185 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
return (struct bpf_map *) (unsigned long) imm64;
}

+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define PUSH_INT(I) \
+ do { \
+ if (cur_stack >= insn_cnt) { \
+ ret = -E2BIG; \
+ goto free_st; \
+ } \
+ stack[cur_stack++] = I; \
+ } while (0)
+
+#define PEEK_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[cur_stack - 1]; \
+ _ret; \
+ })
+
+#define POP_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[--cur_stack]; \
+ _ret; \
+ })
+
+#define PUSH_INSN(T, W, E) \
+ do { \
+ int w = W; \
+ if (E == FALLTHROUGH && st[T] >= (DISCOVERED | FALLTHROUGH)) \
+ break; \
+ if (E == BRANCH && st[T] >= (DISCOVERED | BRANCH)) \
+ break; \
+ if (w < 0 || w >= insn_cnt) { \
+ verbose("jump out of range from insn %d to %d\n", T, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } \
+ if (st[w] == 0) { \
+ /* tree-edge */ \
+ st[T] = DISCOVERED | E; \
+ st[w] = DISCOVERED; \
+ PUSH_INT(w); \
+ goto peek_stack; \
+ } else if ((st[w] & 0xF0) == DISCOVERED) { \
+ verbose("back-edge from insn %d to %d\n", T, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } else if (st[w] == EXPLORED) { \
+ /* forward- or cross-edge */ \
+ st[T] = DISCOVERED | E; \
+ } else { \
+ verbose("insn state internal bug\n"); \
+ ret = -EFAULT; \
+ goto free_st; \
+ } \
+ } while (0)
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int cur_stack = 0;
+ int *stack;
+ int ret = 0;
+ int *st;
+ int i, t;
+
+ st = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ stack = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!stack) {
+ kfree(st);
+ return -ENOMEM;
+ }
+
+ st[0] = DISCOVERED; /* mark 1st insn as discovered */
+ PUSH_INT(0);
+
+peek_stack:
+ while ((t = PEEK_INT()) != -1) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto free_st;
+ }
+ /* unconditional jump with single edge */
+ PUSH_INSN(t, t + insns[t].off + 1, FALLTHROUGH);
+ } else {
+ /* conditional jump with two edges */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ PUSH_INSN(t, t + insns[t].off + 1, BRANCH);
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ }
+
+mark_explored:
+ st[t] = EXPLORED;
+ if (POP_INT() == -1) {
+ verbose("pop_int internal bug\n");
+ ret = -EFAULT;
+ goto free_st;
+ }
+ }
+
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (st[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto free_st;
+ }
+ }
+
+free_st:
+ kfree(st);
+ kfree(stack);
+ return ret;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -462,6 +641,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

skip_full_check:

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 AM9/10/14
to
1.
the library includes a trivial set of BPF syscall wrappers:
int bpf_create_map(int key_size, int value_size, int max_entries);
int bpf_update_elem(int fd, void *key, void *value);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct sock_filter_int *insns, int insn_len,
const char *license);
bpf_prog_load() stores verifier log into global bpf_log_buf[] array

2.
test stubs configure eBPF infra with 'unspec' map and program types.
These are fake types used by user space testsuite only.

3.
verifier tests valid and invalid programs and expects predefined
error log messages from kernel.
40 tests so far.

$ sudo ./test_verifier
#0 add+sub+mul OK
#1 unreachable OK
#2 unreachable2 OK
#3 out of range jump OK
#4 out of range jump2 OK
#5 test1 ld_imm64 OK
...

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/Makefile | 4 +
kernel/bpf/test_stub.c | 109 +++++++++
lib/Kconfig.debug | 3 +-
samples/bpf/Makefile | 12 +
samples/bpf/libbpf.c | 89 +++++++
samples/bpf/libbpf.h | 21 ++
samples/bpf/test_verifier.c | 548 +++++++++++++++++++++++++++++++++++++++++++
7 files changed, 785 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/test_stub.c
create mode 100644 samples/bpf/Makefile
create mode 100644 samples/bpf/libbpf.c
create mode 100644 samples/bpf/libbpf.h
create mode 100644 samples/bpf/test_verifier.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3c726b0995b7..45427239f375 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
obj-y := core.o syscall.o verifier.o
+
+ifdef CONFIG_TEST_BPF
+obj-y += test_stub.o
+endif
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..d76ed2c2fe4d
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,109 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+};
+
+static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+static struct bpf_func_proto test_funcs[] = {
+ [BPF_FUNC_unspec] = {
+ .func = test_func,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ },
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+ if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ return NULL;
+ return &test_funcs[func_id];
+}
+
+static const struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+} test_ctx_access[] = {
+ [offsetof(struct bpf_context, arg1)] = {
+ FIELD_SIZEOF(struct bpf_context, arg1),
+ BPF_READ
+ },
+ [offsetof(struct bpf_context, arg2)] = {
+ FIELD_SIZEOF(struct bpf_context, arg2),
+ BPF_READ
+ },
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+ return false;
+
+ access = &test_ctx_access[off];
+ if (access->size == size && (access->type & type))
+ return true;
+
+ return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+ .get_func_proto = test_func_proto,
+ .is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+ .ops = &test_ops,
+ .type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static struct bpf_map *test_map_alloc(union bpf_attr *attr)
+{
+ static struct bpf_map map;
+
+ map.key_size = attr->key_size;
+ map.value_size = attr->value_size;
+ map.max_entries = attr->max_entries;
+ return &map;
+}
+
+static void test_map_free(struct bpf_map *map)
+{
+}
+
+static struct bpf_map_ops test_map_ops = {
+ .map_alloc = test_map_alloc,
+ .map_free = test_map_free,
+};
+
+static struct bpf_map_type_list tl_map = {
+ .ops = &test_map_ops,
+ .type = BPF_MAP_TYPE_UNSPEC,
+};
+
+static int __init register_test_ops(void)
+{
+ bpf_register_map_type(&tl_map);
+ bpf_register_prog_type(&tl_prog);
+ return 0;
+}
+late_initcall(register_test_ops);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a28590083622..3ac43f34437b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1672,7 +1672,8 @@ config TEST_BPF
against the BPF interpreter or BPF JIT compiler depending on the
current setting. This is in particular useful for BPF JIT compiler
development, but also to run regression tests against changes in
- the interpreter code.
+ the interpreter code. It also enables test stubs for eBPF maps and
+ verifier used by user space verifier testsuite.

If unsure, say N.

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
new file mode 100644
index 000000000000..634391797856
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,12 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := test_verifier
+
+test_verifier-objs := test_verifier.o libbpf.o
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS += -I$(objtree)/usr/include
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
new file mode 100644
index 000000000000..cae0c734274c
--- /dev/null
+++ b/samples/bpf/libbpf.c
@@ -0,0 +1,89 @@
+/* eBPF mini library */
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/unistd.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/netlink.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include "libbpf.h"
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries)
+{
+ union bpf_attr attr = {
+ .map_type = map_type,
+ .key_size = key_size,
+ .value_size = value_size,
+ .max_entries = max_entries
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
+int bpf_update_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .value = value,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_lookup_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .value = value,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_delete_elem(int fd, void *key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_get_next_key(int fd, void *key, void *next_key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .next_key = next_key,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+
+#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
+
+char bpf_log_buf[LOG_BUF_SIZE];
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ const struct bpf_insn *insns, int prog_len,
+ const char *license)
+{
+ union bpf_attr attr = {
+ .prog_type = prog_type,
+ .insns = insns,
+ .insn_cnt = prog_len / sizeof(struct bpf_insn),
+ .license = license,
+ .log_buf = bpf_log_buf,
+ .log_size = LOG_BUF_SIZE,
+ .log_level = 1,
+ };
+
+ bpf_log_buf[0] = 0;
+
+ return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
new file mode 100644
index 000000000000..b19e39794291
--- /dev/null
+++ b/samples/bpf/libbpf.h
@@ -0,0 +1,21 @@
+/* eBPF mini library */
+#ifndef __LIBBPF_H
+#define __LIBBPF_H
+
+struct bpf_insn;
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries);
+int bpf_update_elem(int fd, void *key, void *value);
+int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_delete_elem(int fd, void *key);
+int bpf_get_next_key(int fd, void *key, void *next_key);
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ const struct bpf_insn *insns, int insn_len,
+ const char *license);
+
+#define LOG_BUF_SIZE 8192
+extern char bpf_log_buf[LOG_BUF_SIZE];
+
+#endif
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
new file mode 100644
index 000000000000..d10992e2740e
--- /dev/null
+++ b/samples/bpf/test_verifier.c
@@ -0,0 +1,548 @@
+/*
+ * Testsuite for eBPF verifier
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include "libbpf.h"
+
+#define MAX_INSNS 512
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct bpf_test {
+ const char *descr;
+ struct bpf_insn insns[MAX_INSNS];
+ int fixup[32];
+ const char *errstr;
+ enum {
+ ACCEPT,
+ REJECT
+ } result;
+};
+
+static struct bpf_test tests[] = {
+ {
+ "add+sub+mul",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2),
+ BPF_MOV64_IMM(BPF_REG_2, 3),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ },
+ {
+ "unreachable",
+ .insns = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+ },
+ {
+ "unreachable2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+ },
+ {
+ "out of range jump",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "out of range jump2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "test1 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .result = REJECT,
+ },
+ {
+ "test2 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .result = REJECT,
+ },
+ {
+ "test3 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "test4 ld_imm64",
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "test5 ld_imm64",
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "no bpf_exit",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "loop (back-edge)",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "loop2 (back-edge)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "conditional loop",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "read uninitialized register",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 !read_ok",
+ .result = REJECT,
+ },
+ {
+ "read invalid register",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid",
+ .result = REJECT,
+ },
+ {
+ "program doesn't init R0 before exit",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok",
+ .result = REJECT,
+ },
+ {
+ "stack out of bounds",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid stack",
+ .result = REJECT,
+ },
+ {
+ "invalid call insn1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_CALL uses reserved",
+ .result = REJECT,
+ },
+ {
+ "invalid call insn2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_CALL uses reserved",
+ .result = REJECT,
+ },
+ {
+ "invalid function call",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid func 1234567",
+ .result = REJECT,
+ },
+ {
+ "uninitialized stack1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {2},
+ .errstr = "invalid indirect read from stack",
+ .result = REJECT,
+ },
+ {
+ "uninitialized stack2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid read from stack",
+ .result = REJECT,
+ },
+ {
+ "check valid spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+
+ /* fill it back into R2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+
+ /* should be able to access R0 = *(R2 + 8) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ },
+ {
+ "check corrupted spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+
+ /* mess up with R1 pointer on stack */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
+
+ /* fill back into R0 should fail */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "corrupted spill",
+ .result = REJECT,
+ },
+ {
+ "invalid src register in STX",
+ .insns = {
+ BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in STX",
+ .insns = {
+ BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in ST",
+ .insns = {
+ BPF_ST_MEM(BPF_B, 14, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid src register in LDX",
+ .insns = {
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R12 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in LDX",
+ .insns = {
+ BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R11 is invalid",
+ .result = REJECT,
+ },
+ {
+ "junk insn",
+ .insns = {
+ BPF_RAW_INSN(0, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM",
+ .result = REJECT,
+ },
+ {
+ "junk insn2",
+ .insns = {
+ BPF_RAW_INSN(1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_LDX uses reserved fields",
+ .result = REJECT,
+ },
+ {
+ "junk insn3",
+ .insns = {
+ BPF_RAW_INSN(-1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_ALU opcode f0",
+ .result = REJECT,
+ },
+ {
+ "junk insn4",
+ .insns = {
+ BPF_RAW_INSN(-1, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_ALU opcode f0",
+ .result = REJECT,
+ },
+ {
+ "junk insn5",
+ .insns = {
+ BPF_RAW_INSN(0x7f, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_ALU uses reserved fields",
+ .result = REJECT,
+ },
+ {
+ "misaligned read from stack",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned access",
+ .result = REJECT,
+ },
+ {
+ "invalid map_fd for function call",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fd 0 is not pointing to valid bpf_map",
+ .result = REJECT,
+ },
+ {
+ "don't check return value before access",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "R0 invalid mem access 'map_value_or_null'",
+ .result = REJECT,
+ },
+ {
+ "access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "misaligned access",
+ .result = REJECT,
+ },
+ {
+ "sometimes access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "R0 invalid mem access",
+ .result = REJECT,
+ },
+};
+
+static int probe_filter_length(struct bpf_insn *fp)
+{
+ int len = 0;
+
+ for (len = MAX_INSNS - 1; len > 0; --len)
+ if (fp[len].code != 0 || fp[len].imm != 0)
+ break;
+
+ return len + 1;
+}
+
+static int create_map(void)
+{
+ long long key, value = 0;
+ int map_fd;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ }
+
+ return map_fd;
+}
+
+static int test(void)
+{
+ int prog_fd, i;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ struct bpf_insn *prog = tests[i].insns;
+ int prog_len = probe_filter_length(prog);
+ int *fixup = tests[i].fixup;
+ int map_fd = -1;
+
+ if (*fixup) {
+ map_fd = create_map();
+
+ do {
+ prog[*fixup].imm = map_fd;
+ fixup++;
+ } while (*fixup);
+ }
+ printf("#%d %s ", i, tests[i].descr);
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
+ prog_len * sizeof(struct bpf_insn),
+ "GPL");
+
+ if (tests[i].result == ACCEPT) {
+ if (prog_fd < 0) {
+ printf("FAIL\nfailed to load prog '%s'\n",
+ strerror(errno));
+ printf("%s", bpf_log_buf);
+ goto fail;
+ }
+ } else {
+ if (prog_fd >= 0) {
+ printf("FAIL\nunexpected success to load\n");
+ printf("%s", bpf_log_buf);
+ goto fail;
+ }
+ if (strstr(bpf_log_buf, tests[i].errstr) == 0) {
+ printf("FAIL\nunexpected error message: %s",
+ bpf_log_buf);
+ goto fail;
+ }
+ }
+
+ printf("OK\n");
+fail:
+ if (map_fd >= 0)
+ close(map_fd);
+ close(prog_fd);
+
+ }
+
+ return 0;
+}
+
+int main(void)
+{
+ return test();
+}

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:02 AM9/10/14
to
'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error

- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error

- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 8 ++
include/uapi/linux/bpf.h | 42 +++++++++
kernel/bpf/syscall.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 269 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48014a71f0fe..2887f3f9da59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,7 @@

#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
+#include <linux/file.h>

struct bpf_map;

@@ -17,6 +18,12 @@ struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_free)(struct bpf_map *);
+ int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+
+ /* funcs callable from userspace and from eBPF programs */
+ void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+ int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+ int (*map_delete_elem)(struct bpf_map *map, void *key);
};

struct bpf_map {
@@ -37,5 +44,6 @@ struct bpf_map_type_list {

void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
+struct bpf_map *bpf_map_get(struct fd f);

#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7d83ef63849d..3a03fdf4db0e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -70,6 +70,35 @@ enum bpf_cmd {
* map is deleted when fd is closed
*/
BPF_MAP_CREATE,
+
+ /* lookup key in a given map
+ * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero and stores found elem into value
+ * or negative error
+ */
+ BPF_MAP_LOOKUP_ELEM,
+
+ /* create or update key/value pair in a given map
+ * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero or negative error
+ */
+ BPF_MAP_UPDATE_ELEM,
+
+ /* find and delete elem by key in a given map
+ * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key
+ * returns zero or negative error
+ */
+ BPF_MAP_DELETE_ELEM,
+
+ /* lookup key in a given map and return next key
+ * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->next_key
+ * returns zero and stores next key or negative error
+ */
+ BPF_MAP_GET_NEXT_KEY,
};

enum bpf_map_type {
@@ -84,6 +113,19 @@ union bpf_attr {
__u32 max_entries; /* max number of entries in a map */
#define BPF_MAP_CREATE_LAST_FIELD max_entries
};
+
+ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+ int map_fd;
+ void __user *key;
+ union {
+ void __user *value;
+ void __user *next_key;
+ };
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+ };
};

#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e353eaf3ac59..5fbcfeaf7403 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
+#include <linux/file.h>

static LIST_HEAD(bpf_map_types);

@@ -109,6 +110,212 @@ free_map:
return err;
}

+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *uvalue = attr->value;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ESRCH;
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (!value)
+ goto err_unlock;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto err_unlock;
+
+ err = 0;
+
+err_unlock:
+ rcu_read_unlock();
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *uvalue = attr->value;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *unext_key = attr->next_key;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr *attr;
@@ -138,6 +345,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_CREATE:
err = map_create(attr);
break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(attr);
+ break;
default:
err = -EINVAL;
break;

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:02 AM9/10/14
to
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
* and returns map_fd on success.
* use close(map_fd) to delete the map
*/
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

syscall is using 'union bpf_attr' to be backwards compatible with future
extensions. Different syscall commands will use different attributes.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 39 +++++++++
include/linux/bpf.h | 41 ++++++++++
include/uapi/linux/bpf.h | 24 ++++++
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 149 +++++++++++++++++++++++++++++++++++
5 files changed, 254 insertions(+), 1 deletion(-)
create mode 100644 include/linux/bpf.h
create mode 100644 kernel/bpf/syscall.c

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 81916ab5d96f..1900d29518f1 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
32-bit immediate value into a register.

+eBPF maps
+---------
+'maps' is a generic storage of different types for sharing data between kernel
+and userspace.
+
+The maps are accessed from user space via BPF syscall, which has commands:
+- create a map with given type and attributes
+ map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
+ using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
+ returns process-local file descriptor or negative error
+
+- lookup key in a given map
+ err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key, attr->value
+ returns zero and stores found elem into value or negative error
+
+- create or update key/value pair in a given map
+ err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key, attr->value
+ returns zero or negative error
+
+- find and delete element by key in a given map
+ err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key
+
+- to delete map: close(fd)
+ Exiting process will delete maps automatically
+
+userspace programs use this syscall to create/access maps that eBPF programs
+are concurrently updating.
+
+maps can have different types: hash, array, bloom filter, radix-tree, etc.
+
+The map is defined by:
+ . type
+ . max number of elements
+ . key size in bytes
+ . value size in bytes
+
Testing
-------

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
new file mode 100644
index 000000000000..48014a71f0fe
--- /dev/null
+++ b/include/linux/bpf.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_BPF_H
+#define _LINUX_BPF_H 1
+
+#include <uapi/linux/bpf.h>
+#include <linux/workqueue.h>
+
+struct bpf_map;
+
+/* map is generic key/value storage optionally accesible by eBPF programs */
+struct bpf_map_ops {
+ /* funcs callable from userspace (via syscall) */
+ struct bpf_map *(*map_alloc)(union bpf_attr *attr);
+ void (*map_free)(struct bpf_map *);
+};
+
+struct bpf_map {
+ atomic_t refcnt;
+ enum bpf_map_type map_type;
+ u32 key_size;
+ u32 value_size;
+ u32 max_entries;
+ struct bpf_map_ops *ops;
+ struct work_struct work;
+};
+
+struct bpf_map_type_list {
+ struct list_head list_node;
+ struct bpf_map_ops *ops;
+ enum bpf_map_type type;
+};
+
+void bpf_register_map_type(struct bpf_map_type_list *tl);
+void bpf_map_put(struct bpf_map *map);
+
+#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 479ed0b6be16..7d83ef63849d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -62,4 +62,28 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};

+/* BPF syscall commands */
+enum bpf_cmd {
+ /* create a map with given type and attributes
+ * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
+ * returns fd or negative error
+ * map is deleted when fd is closed
+ */
+ BPF_MAP_CREATE,
+};
+
+enum bpf_map_type {
+ BPF_MAP_TYPE_UNSPEC,
+};
+
+union bpf_attr {
+ struct { /* anonymous struct used by BPF_MAP_CREATE command */
+ enum bpf_map_type map_type;
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* max number of entries in a map */
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+ };
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..e9f7334ed07a 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o
+obj-y := core.o syscall.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..e353eaf3ac59
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,149 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD)) != NULL
+
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr *attr;
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted when security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* newer userspace cannot run with older kernel */
+ if (size > sizeof(*attr))
+ return -EINVAL;
+
+ attr = kzalloc(sizeof(*attr), GFP_USER);
+ if (!attr)
+ return -ENOMEM;
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ err = -EFAULT;
+ if (copy_from_user(attr, uattr, size) != 0)
+ goto free_attr;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+free_attr:
+ kfree(attr);
+ return err;
+}

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:02 AM9/10/14
to
Hi David,

I've managed to reduce this set to 12:
Patches 1-4 establish BPF syscall shell for maps and programs.
Patches 5-10 add verifier step by step
Patch 11 exposes existing instruction macros to user space
Patch 12 adds test stubs and verifier testsuite from user space

I don't know how to reduce it further. Drop verifier and
have programs loaded without verification? Sounds wrong.
If anyone has other ideas, I'll gladly reduce it further.

Note that patches 1,3,4,7 add commands and attributes to the syscall
while being backwards compatible from each other, which should demonstrate
how other commands can be added in the future.

Daniel,
bpf_common.h patch (that we discussed earlier) I didn't include here
to reduce the number of patches. It can come next.

For those who have looked at the last set of 28 patches, the difference is:
- moved attaching to tracing and sockets to future patches
- moved hash table map type implementation to future
- split verifier further and moved LD_ABS checks and state prunning to future
- instead of running verifier testsuite on real tracing programs added
test_stub.c with fake maps, context and helper functions to test verifier only
- rebased

Note, after this set the programs can be loaded for testing only. They cannot
be attached to any events. This will come in the next set.

As requested by Andy and others, here is the man page:

BPF(2) Linux Programmer's Manual BPF(2)



NAME
bpf - perform a command on eBPF map or program

SYNOPSIS
#include <linux/bpf.h>

int bpf(int cmd, union bpf_attr *attr, unsigned int size);


DESCRIPTION
bpf() syscall is a multiplexor for a range of different operations on
eBPF which can be characterized as "universal in-kernel virtual
machine". eBPF is similar to original Berkeley Packet Filter (or "clas-
sic BPF") used to filter network packets. Both statically analyze the
programs before loading them into the kernel to ensure that programs
cannot harm the running system.

eBPF extends classic BPF in multiple ways including ability to call in-
kernel helper functions and access shared data structures like eBPF
maps. The programs can be written in a restricted C that is compiled
into eBPF bytecode and executed on the eBPF virtual machine or JITed
into native instruction set.

eBPF Design/Architecture
eBPF maps is a generic storage of different types. User process can
create multiple maps (with key/value being opaque bytes of data) and
access them via file descriptor. In parallel eBPF programs can access
maps from inside the kernel. It's up to user process and eBPF program
to decide what they store inside maps.

eBPF programs are similar to kernel modules. They are loaded by the
user process and automatically unloaded when process exits. Each eBPF
program is a safe run-to-completion set of instructions. eBPF verifier
statically determines that the program terminates and is safe to exe-
cute. During verification the program takes a hold of maps that it
intends to use, so selected maps cannot be removed until the program is
unloaded. The program can be attached to different events. These events
can be packets, tracepoint events and other types in the future. A new
event triggers execution of the program which may store information
about the event in the maps. Beyond storing data the programs may call
into in-kernel helper functions which may, for example, dump stack, do
trace_printk or other forms of live kernel debugging. The same program
can be attached to multiple events. Different programs can access the
same map:
tracepoint tracepoint tracepoint sk_buff sk_buff
event A event B event C on eth0 on eth1
| | | | |
| | | | |
--> tracing <-- tracing socket socket
prog_1 prog_2 prog_3 prog_4
| | | |
|--- -----| |-------| map_3
map_1 map_2

Syscall Arguments
bpf() syscall operation is determined by cmd which can be one of the
following:

BPF_MAP_CREATE
Create a map with given type and attributes and return map FD

BPF_MAP_LOOKUP_ELEM
Lookup element by key in a given map and return its value

BPF_MAP_UPDATE_ELEM
Create or update element (key/value pair) in a given map

BPF_MAP_DELETE_ELEM
Lookup and delete element by key in a given map

BPF_MAP_GET_NEXT_KEY
Lookup element by key in a given map and return key of next ele-
ment

BPF_PROG_LOAD
Verify and load eBPF program

attr is a pointer to a union of type bpf_attr as defined below.

size is the size of the union.

union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
enum bpf_map_type map_type;
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};

struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
int map_fd;
void *key;
union {
void *value;
void *next_key;
};
};

struct { /* anonymous struct used by BPF_PROG_LOAD command */
enum bpf_prog_type prog_type;
__u32 insn_cnt;
const struct bpf_insn *insns;
const char *license;
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
void *log_buf; /* user supplied buffer */
};
};

eBPF maps
maps is a generic storage of different types for sharing data between
kernel and userspace.

Any map type has the following attributes:
. type
. max number of elements
. key size in bytes
. value size in bytes

The following wrapper functions demonstrate how this syscall can be
used to access the maps. The functions use the cmd argument to invoke
different operations.

BPF_MAP_CREATE
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
bpf() syscall creates a map of map_type type and given
attributes key_size, value_size, max_entries. On success it
returns process-local file descriptor or negative error other-
wise.

BPF_MAP_LOOKUP_ELEM
int bpf_lookup_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.value = value,
};

return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
bpf() syscall looks up an element with given key in a map fd.
If element is found it returns zero and stores element's value
into value. Otherwise negative error is returned.

BPF_MAP_UPDATE_ELEM
int bpf_update_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.value = value,
};

return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
The call creates or updates element with given key/value in a
map fd. On success it returns zero or negative error otherwise.

BPF_MAP_DELETE_ELEM
int bpf_delete_elem(int fd, void *key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
};

return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
The call deletes an element in a map fd with given key.

BPF_MAP_GET_NEXT_KEY
int bpf_get_next_key(int fd, void *key, void *next_key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.next_key = next_key,
};

return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}
The call looks up an element by key in a given map fd and
returns key of next element into next_key pointer. On success it
returns zero or negative error otherwise. This method can be
used to iterate over all elements of the map.

close(map_fd)
will delete the map map_fd. Exiting process will delete all
maps automatically.

In the future maps can have different types: hash, array, bloom filter,
radix-tree, but currently only hash type is supported:
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH,
};

eBPF programs
BPF_PROG_LOAD
This cmd is used to load eBPF program into the kernel.

char bpf_log_buf[LOG_BUF_SIZE];

int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = insns,
.insn_cnt = insn_cnt,
.license = license,
.log_buf = bpf_log_buf,
.log_size = LOG_BUF_SIZE,
.log_level = 1,
};

return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
prog_type one of the available program types:
enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET_FILTER,
BPF_PROG_TYPE_TRACING_FILTER,
};
insns array of "struct bpf_insn" instructions

insn_cnt number of instructions in the program

license license string, which must be GPL compatible to call
helper functions marked gpl_only

log_buf user supplied buffer that in-kernel verifier is using to
store verification log

log_size size of user buffer

log_level verbosity level of eBPF verifier, where zero means no
logs provided

close(prog_fd)
will unload eBPF program

The maps are accesible from programs and generally tie the two
together. Programs process various events (like tracepoint, kprobe,
packets) and store the data into maps. User space fetches data from
maps. Either the same or a different map may be used by user space as
configuration space to alter program behavior on the fly.

Events
Once an eBPF program is loaded, it can be attached to an event. Various
kernel subsystems have different ways to do so. For example:

setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));
will attach the program prog_fd to socket sock which was received by
prior call to socket().

ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
will attach the program prog_fd to perf event event_fd which was
received by prior call to perf_event_open().

Another way to attach the program to a tracing event is:
event_fd = open("/sys/kernel/debug/tracing/events/skb/kfree_skb/filter");
write(event_fd, "bpf-123"); /* where 123 is eBPF program FD */
/* here program is attached and will be triggered by events */
close(event_fd); /* to detach from event */

EXAMPLES
/* eBPF+sockets example:
* 1. create map with maximum of 2 elements
* 2. set map[6] = 0 and map[17] = 0
* 3. load eBPF program that counts number of TCP and UDP packets received
* via map[skb->ip->proto]++
* 4. attach prog_fd to raw socket via setsockopt()
* 5. print number of received TCP/UDP packets every second
*/
int main(int ac, char **av)
{
static struct bpf_insn prog[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_LD_ABS(BPF_B, 14 + 9 /* R0 = ip->proto */),
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
BPF_EXIT_INSN(),
};
int sock, map_fd, prog_fd, key;
long long value = 0, tcp_cnt, udp_cnt;

map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2);
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
/* likely not run as root */
return 1;
}

key = 6; /* tcp */
assert(bpf_update_elem(map_fd, &key, &value) == 0);

key = 17; /* udp */
assert(bpf_update_elem(map_fd, &key, &value) == 0);

prog[5].imm = map_fd;
prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
"GPL");
assert(prog_fd >= 0);

sock = open_raw_sock("lo");

assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
sizeof(prog_fd)) == 0);

for (;;) {
key = 6;
assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
key = 17;
assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);
printf("TCP %lld UDP %lld packets0, tcp_cnt, udp_cnt);
sleep(1);
}

return 0;
}

RETURN VALUE
For a successful call, the return value depends on the operation:

BPF_MAP_CREATE
The new file descriptor associated with eBPF map.

BPF_PROG_LOAD
The new file descriptor associated with eBPF program.

All other commands
Zero.

On error, -1 is returned, and errno is set appropriately.

ERRORS
EPERM bpf() syscall was made without sufficient privilege (without the
CAP_SYS_ADMIN capability).

ENOMEM Cannot allocate sufficient memory.

EBADF fd is not an open file descriptor

EFAULT One of the pointers ( key or value or log_buf or insns ) is out-
side accessible address space.

EINVAL The value specified in cmd is not recognized by this kernel.

EINVAL For BPF_MAP_CREATE, either map_type or attributes are invalid.

EINVAL For BPF_MAP_*_ELEM commands, some of the fields of "union
bpf_attr" unused by this command are not set to zero.

EINVAL For BPF_PROG_LOAD, attempt to load invalid program (unrecognized
instruction or uses reserved fields or jumps out of range or
loop detected or calls unknown function).

EACCES For BPF_PROG_LOAD, though program has valid instructions, it was
rejected, since it was deemed unsafe (may access disallowed mem-
ory region or uninitialized stack/register or function con-
straints don't match actual types or misaligned access). In such
case it is recommended to call bpf() again with log_level = 1
and examine log_buf for specific reason provided by verifier.

ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that
element with given key was not found.

E2BIG program is too large.

NOTES
These commands may be used only by a privileged process (one having the
CAP_SYS_ADMIN capability).

SEE ALSO
eBPF architecture and instruction set is explained in Documenta-
tion/networking/filter.txt



Linux 2014-09-01 BPF(2)

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:02 AM9/10/14
to
This patch adds verifier core which simulates execution of every insn and
records the state of registers and program stack. Every branch instruction seen
during simulation is pushed into state stack. When verifier reaches BPF_EXIT,
it pops the state from the stack and continues until it reaches BPF_EXIT again.
For program:
1: bpf_mov r1, xxx
2: if (r1 == 0) goto 5
3: bpf_mov r0, 1
4: goto 6
5: bpf_mov r0, 2
6: bpf_exit
The verifier will walk insns: 1, 2, 3, 4, 6
then it will pop the state recorded at insn#2 and will continue: 5, 6

This way it walks all possible paths through the program and checks all
possible values of registers. While doing so, it checks for:
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention
- instruction encoding is not using reserved fields

Kernel subsystem configures the verifier with two callbacks:

- bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
that provides information to the verifer which fields of 'ctx'
are accessible (remember 'ctx' is the first argument to eBPF program)

- const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
returns argument constraints of kernel helper functions that eBPF program
may call, so that verifier can checks that R1-R5 types match the prototype

More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 47 +++
kernel/bpf/verifier.c | 1003 ++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 1049 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 490551e17c15..ad1bda7ece35 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
struct bpf_map *bpf_map_get(struct fd f);

+/* function argument constraints */
+enum bpf_arg_type {
+ ARG_ANYTHING = 0, /* any argument is ok */
+
+ /* the following constraints used to prototype
+ * bpf_map_lookup/update/delete_elem() functions
+ */
+ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */
+ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
+ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
+
+ /* the following constraints used to prototype bpf_memcmp() and other
+ * functions that access data on eBPF program stack
+ */
+ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */
+ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */
+};
+
+/* type of values returned from helper functions */
+enum bpf_return_type {
+ RET_INTEGER, /* function returns integer */
+ RET_VOID, /* function doesn't return anything */
+ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
+};
+
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
* to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
* instructions after verifying
@@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f);
struct bpf_func_proto {
u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
bool gpl_only;
+ enum bpf_return_type ret_type;
+ enum bpf_arg_type arg1_type;
+ enum bpf_arg_type arg2_type;
+ enum bpf_arg_type arg3_type;
+ enum bpf_arg_type arg4_type;
+ enum bpf_arg_type arg5_type;
+};
+
+/* bpf_context is intentionally undefined structure. Pointer to bpf_context is
+ * the first argument to eBPF programs.
+ * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
+ */
+struct bpf_context;
+
+enum bpf_access_type {
+ BPF_READ = 1,
+ BPF_WRITE = 2
};

struct bpf_verifier_ops {
/* return eBPF function prototype for verification */
const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+
+ /* return true if 'size' wide access at offset 'off' within bpf_context
+ * with 'type' (read or write) is allowed
+ */
+ bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
};

struct bpf_prog_type_list {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f03257de2bc3..aea91af1e6fc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,6 +125,72 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+#define _(OP) ({ int ret = (OP); if (ret < 0) return ret; })
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* 1st byte of register spilled into stack */
+ STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+struct bpf_stack_slot {
+ enum bpf_stack_slot_type stype;
+ struct reg_state reg_st;
+};
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ struct bpf_stack_slot stack[MAX_BPF_STACK];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifer state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */

/* single container for all structs
@@ -132,6 +198,9 @@
*/
struct verifier_env {
struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
};
@@ -160,6 +229,45 @@ static void verbose(const char *fmt, ...)
va_end(args);
}

+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (env->cur_state.stack[i].stype == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.stack[i].reg_st.type]);
+ }
+ verbose("\n");
+}
+
static const char *const bpf_class_string[] = {
[BPF_LD] = "ld",
[BPF_LDX] = "ldx",
@@ -305,6 +413,695 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}

+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ struct bpf_stack_slot *slot;
+ int i;
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+ slot->stype = STACK_SPILL;
+ /* save register state */
+ slot->reg_st = state->regs[value_regno];
+ for (i = 1; i < 8; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_SPILL_PART;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ } else {
+
+ /* regular write of data into stack */
+ for (i = 0; i < size; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_MISC;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+
+ if (slot->stype == STACK_SPILL) {
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < 8; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_SPILL_PART) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] = slot->reg_st;
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->info->ops->is_valid_access &&
+ env->prog->info->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size;
+
+ _(size = bpf_size_to_bytes(bpf_size));
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ _(check_map_access(env, regno, off, size));
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ _(check_ctx_access(env, off, size, t));
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ _(check_stack_write(state, off, size, value_regno));
+ else
+ _(check_stack_read(state, off, size, value_regno));
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return 0;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check whether atomic_add can read the memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1));
+
+ /* check whether atomic_add can write into the same memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1));
+ return 0;
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno, (*mapp)->key_size));
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno, (*mapp)->value_size));
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno - 1, reg->imm));
+ }
+
+ return 0;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->info->ops->get_func_proto)
+ fn = env->prog->info->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->info->is_gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ _(check_func_arg(env, BPF_REG_1, fn->arg1_type, &map));
+ _(check_func_arg(env, BPF_REG_2, fn->arg2_type, &map));
+ _(check_func_arg(env, BPF_REG_3, fn->arg3_type, &map));
+ _(check_func_arg(env, BPF_REG_4, fn->arg4_type, &map));
+ _(check_func_arg(env, BPF_REG_5, fn->arg5_type, &map));
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
/* return the map pointer stored inside BPF_LD_IMM64 instruction */
static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
{
@@ -313,6 +1110,34 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
return (struct bpf_map *) (unsigned long) imm64;
}

+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -492,6 +1317,181 @@ free_st:
return ret;
}

+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ _(check_alu_op(regs, insn));
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK));
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ _(check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg));
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ _(check_xadd(env, insn));
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg));
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1));
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_call(env, insn->imm));
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ _(check_reg_arg(regs, BPF_REG_0, SRC_OP));
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ _(check_cond_jmp_op(env, insn, &insn_idx));
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ verbose("LD_ABS is not supported yet\n");
+ return -EINVAL;
+ } else if (mode == BPF_IMM) {
+ _(check_ld_imm(env, insn));
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -645,9 +1645,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

- /* ret = do_check(env); */
+ ret = do_check(env);

skip_full_check:
+ while (pop_stack(env, NULL) >= 0);

if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
--
1.7.9.5

Daniel Borkmann

unread,
Sep 10, 2014, 4:10:03 AM9/10/14
to
On 09/10/2014 07:10 AM, Alexei Starovoitov wrote:
> eBPF programs are similar to kernel modules. They are loaded by the user
> process and automatically unloaded when process exits. Each eBPF program is
> a safe run-to-completion set of instructions. eBPF verifier statically
> determines that the program terminates and is safe to execute.
>
> The following syscall wrapper can be used to load the program:
> int bpf_prog_load(enum bpf_prog_type prog_type,
> const struct bpf_insn *insns, int insn_cnt,
> const char *license)
> {
> union bpf_attr attr = {
> .prog_type = prog_type,
> .insns = insns,
> .insn_cnt = insn_cnt,
> .license = license,
> };
>
> return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
> }
> where 'insns' is an array of eBPF instructions and 'license' is a string
> that must be GPL compatible to call helper functions marked gpl_only
>
> Upon succesful load the syscall returns prog_fd.
> Use close(prog_fd) to unload the program.
>
> User space tests and examples follow in the later patches
>
> Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
...
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 4b59edead908..9727616693e5 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -15,6 +15,7 @@
> struct sk_buff;
> struct sock;
> struct seccomp_data;
> +struct bpf_prog_info;
>
> /* ArgX, context and stack frame pointer register positions. Note,
> * Arg1, Arg2, Arg3, etc are used as argument mappings of function
> @@ -302,8 +303,12 @@ struct bpf_work_struct {
> struct bpf_prog {
> u16 pages; /* Number of allocated pages */
> bool jited; /* Is our filter JIT'ed? */
> + bool has_info; /* whether 'info' is valid */
> u32 len; /* Number of filter blocks */
> - struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + union {
> + struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + struct bpf_prog_info *info;
> + };

All members of this bpf_prog_info should go into bpf_work_struct,
as I have intended this to be a ancillary structure here. Since
we already allocate this anyway, you can reduce complexity by doing
the additional allocation plus remove the has_info member.

> struct bpf_work_struct *work; /* Deferred free work struct */
> unsigned int (*bpf_func)(const struct sk_buff *skb,
> const struct bpf_insn *filter);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 3a03fdf4db0e..1d0411965576 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -99,12 +99,23 @@ enum bpf_cmd {
...
> +/* called by sockets/tracing/seccomp before attaching program to an event
> + * pairs with bpf_prog_put()
> + */

But seccomp already does refcounting on each BPF filter. Or, is the
intention to remove this from seccomp?

> +struct bpf_prog *bpf_prog_get(u32 ufd)
> +{
> + struct fd f = fdget(ufd);
> + struct bpf_prog *prog;
> +
> + prog = get_prog(f);
> +
> + if (IS_ERR(prog))
> + return prog;
> +
> + atomic_inc(&prog->info->refcnt);
> + fdput(f);
> + return prog;
> +}
...
> diff --git a/net/core/filter.c b/net/core/filter.c
> index dfc716ffa44b..d771e4f03745 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -835,6 +835,7 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
> {
> struct sock_fprog_kern *fprog = fp->orig_prog;
>
> + BUG_ON(fp->has_info);

Why BUG_ON() (also in so many other places)?

> if (fprog) {
> kfree(fprog->filter);
> kfree(fprog);
> @@ -973,6 +974,7 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
>
> fp->bpf_func = NULL;
> fp->jited = false;
> + fp->has_info = false;
>
> err = bpf_check_classic(fp->insns, fp->len);
> if (err) {
>
--

Daniel Borkmann

unread,
Sep 10, 2014, 4:20:01 AM9/10/14
to
On 09/10/2014 07:09 AM, Alexei Starovoitov wrote:
...
> As requested by Andy and others, here is the man page:
>
> BPF(2) Linux Programmer's Manual BPF(2)
...
> In the future maps can have different types: hash, array, bloom filter,
> radix-tree, but currently only hash type is supported:
> enum bpf_map_type {
> BPF_MAP_TYPE_UNSPEC,
> BPF_MAP_TYPE_HASH,
> };

If we mention them here in the man page, users are going to request
them, naturally. ;) So I'd just mention what we have, not what we do
not yet have. But I'm wondering how much library boiler plate only
for BPF we want to add in future, hopefully not most of what's in
Corman ... ? ;)

Daniel Borkmann

unread,
Sep 10, 2014, 5:10:02 AM9/10/14
to
On 09/10/2014 07:09 AM, Alexei Starovoitov wrote:
...
> struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> int map_fd;
> void *key;
> union {
> void *value;
> void *next_key;
> };
> };
>
> struct { /* anonymous struct used by BPF_PROG_LOAD command */
> enum bpf_prog_type prog_type;
> __u32 insn_cnt;
> const struct bpf_insn *insns;
> const char *license;
> __u32 log_level; /* verbosity level of eBPF verifier */
> __u32 log_size; /* size of user buffer */
> void *log_buf; /* user supplied buffer */

What is log buffer? Would that mean the verifier will return an error
string if the program will not pass it, or if not, what other data?
I think the man page is missing how to examine the returned verifier
log buffer data.

Daniel Borkmann

unread,
Sep 10, 2014, 5:30:01 AM9/10/14
to
On 09/10/2014 07:09 AM, Alexei Starovoitov wrote:
....
> BPF(2) Linux Programmer's Manual BPF(2)
...
> union bpf_attr {
> struct { /* anonymous struct used by BPF_MAP_CREATE command */
> enum bpf_map_type map_type;
> __u32 key_size; /* size of key in bytes */
> __u32 value_size; /* size of value in bytes */
> __u32 max_entries; /* max number of entries in a map */
> };
>
> struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> int map_fd;
> void *key;
> union {
> void *value;
> void *next_key;
> };

When you pass in these structs with pointers in it to other user space
buffers, how do you handle this with mixed 32/64 bit user/kernel space?

As an example, for the current way to load BPF although we export ...

struct sock_fprog {
unsigned short len;
struct sock_filter __user *filter;
};

... through uapi, we still need to handle this via compat_sock_fprog
to take care of different pointer sizes via compat_uptr_t :

#ifdef CONFIG_COMPAT
struct compat_sock_fprog {
u16 len;
compat_uptr_t filter;
};
#endif

Perhaps I'm missing something, but I think, that would currently break in
your syscall handler, no?

> };
>
> struct { /* anonymous struct used by BPF_PROG_LOAD command */
> enum bpf_prog_type prog_type;
> __u32 insn_cnt;
> const struct bpf_insn *insns;
> const char *license;
> __u32 log_level; /* verbosity level of eBPF verifier */
> __u32 log_size; /* size of user buffer */
> void *log_buf; /* user supplied buffer */
> };
> };

Daniel Borkmann

unread,
Sep 10, 2014, 7:30:01 AM9/10/14
to
On 09/10/2014 07:10 AM, Alexei Starovoitov wrote:
> move instruction macros (like BPF_MOV64_REG or BPF_ALU32_IMM)
> from linux/filter.h into uapi/linux/bpf.h
> so that userspace programs can use them.
>
> verifier testsuite (in later patches) will be using them.
>
> Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>

I don't think we need this commit at all. These macros are not really
part of the ABI and we should expose __as less as possible__, otherwise
we won't be able to alter them anymore. Plus, based on your previous
argumentation regarding the header file expose, we also won't be able
to add any new macros anymore since you could run into name collisions.

Anyway, I don't see a reason why a e.g. central user space eBPF library
cannot live with, for example, a private copy of these helper macros or
whatever it wants to define by itself.

I know in classic BPF there's BPF_STMT() and BPF_JUMP() but it's much
less complicated than eBPF and less likely to changes.

Daniel Borkmann

unread,
Sep 10, 2014, 7:40:02 AM9/10/14
to
On 09/10/2014 07:10 AM, Alexei Starovoitov wrote:
Since we already have an extensive BPF test suite, that is, lib/test_bpf.c,
which currently also does sanity checks for the classic BPF verifier, is
there a reason these verifier test cases cannot be extended/integrated there
as well but have to go to kernel/bpf/test_stub.c resp. samples/bpf/test_verifier.c ?
I don't like that we put testing code into kernel/bpf/ whereas we already
have a BPF test infrastructure in the kernel elsewhere.

Alexei Starovoitov

unread,
Sep 10, 2014, 1:20:01 PM9/10/14
to
On Wed, Sep 10, 2014 at 1:04 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>> + bool has_info; /* whether 'info' is valid
>> u32 len; /* Number of filter blocks
>> - struct sock_fprog_kern *orig_prog; /* Original BPF program */
>> + union {
>> + struct sock_fprog_kern *orig_prog; /* Original BPF
>> program */
>> + struct bpf_prog_info *info;
>> + };
>
>
> All members of this bpf_prog_info should go into bpf_work_struct,
> as I have intended this to be a ancillary structure here. Since
> we already allocate this anyway, you can reduce complexity by doing
> the additional allocation plus remove the has_info member.

that's doable, but won't you be worried about extra 6 fields
in there that only used by native eBPF programs?
I kept them separate not to introduce any overhead
to classic programs.

>> struct bpf_work_struct *work; /* Deferred free work
>> struct */

Also we'd need to rename it, adjust the comment above
and move into linux/bpf.h, since it don't want to overload
linux/filter.h with native eBPF stuff. In my mind filter.h
is for classic and socket things, whereas bpf.h is net-less.
So I'm 50/50 on this one.
Dropping has_info flag is definitely a plus.

Rename 'struct bpf_work_struct' to 'struct bpf_prog_info' ?
bpf_prog_aux_data? bpf_prog_extra ?
Naming is hard.

> But seccomp already does refcounting on each BPF filter. Or, is the
> intention to remove this from seccomp?

seccomp refcounts its own wrapper struct on top of classic bpf.
It gets incremented when task is forked, so I suspect it will
be needed even when seccomp moves to native.
Note that 'struct sk_filter' has its own refcnt as well which
gets incremented when socket is cloned. It's independent from
eBPF program refcnt which is part of 'struct bpf_prog_info',
since the same eBPF program can be attached to multiple
sockets. So both are needed. Seccomp may want to attach
the same eBPF program to multiple tasks as well to save
some memory.
In classic BPF we may open multiple sockets and attach
the same classic BPF prog to all. prog is allocated multiple
times. JIT is called multiple times, but overhead is not huge.
For eBPF is not an option. In eBPF+tracing single program
may be attached to hundreds of events.

>> + BUG_ON(fp->has_info);
>
> Why BUG_ON() (also in so many other places)?

because struct bpf_prog can be created in many different
ways and I had a painful bug here while developing.
I think it can be dropped now.

Alexei Starovoitov

unread,
Sep 10, 2014, 1:30:02 PM9/10/14
to
On Wed, Sep 10, 2014 at 1:19 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>
>> In the future maps can have different types: hash, array, bloom
>> filter,
>> radix-tree, but currently only hash type is supported:
>> enum bpf_map_type {
>> BPF_MAP_TYPE_UNSPEC,
>> BPF_MAP_TYPE_HASH,
>> };
>
>
> If we mention them here in the man page, users are going to request
> them, naturally. ;) So I'd just mention what we have, not what we do
> not yet have. But I'm wondering how much library boiler plate only
> for BPF we want to add in future, hopefully not most of what's in
> Corman ... ? ;)

:)
as cover letter says hash type was moved from this set to
later series, so it's implemented.
array and radix-tree are implemented as well, but not ready for
review yet. I have strong use cases for them.
Even two of the simplest examples for tracing (ex2 and ex3)
will benefit from 'array' type.
For bloomfiler type Chema had a strong use case as well.
I was assuming he will implement it sooner or later ;)
But, surely, I can drop future features from the manpage.

Alexei Starovoitov

unread,
Sep 10, 2014, 1:40:02 PM9/10/14
to
On Wed, Sep 10, 2014 at 2:03 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>> struct { /* anonymous struct used by BPF_PROG_LOAD command */
>> enum bpf_prog_type prog_type;
>> __u32 insn_cnt;
>> const struct bpf_insn *insns;
>> const char *license;
>> __u32 log_level; /* verbosity level of
>> eBPF verifier */
>> __u32 log_size; /* size of user buffer */
>> void *log_buf; /* user supplied buffer
>> */
>
>
> What is log buffer? Would that mean the verifier will return an error
> string if the program will not pass it, or if not, what other data?
> I think the man page is missing how to examine the returned verifier
> log buffer data.

yes. it's an error log (as text string for humans) from verifier.
It is briefly explained in the man page:
EACCES For BPF_PROG_LOAD, though program has valid instructions, it was
rejected, since it was deemed unsafe (may access disallowed mem-
ory region or uninitialized stack/register or function con-
straints don't match actual types or misaligned access). In such
case it is recommended to call bpf() again with log_level = 1
and examine log_buf for specific reason provided by verifier.
and there is a whole section about it in
Documentation/networking/filter.txt
called "Understanding eBPF verifier messages".

Alexei Starovoitov

unread,
Sep 10, 2014, 1:50:02 PM9/10/14
to
On Wed, Sep 10, 2014 at 2:21 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>
> When you pass in these structs with pointers in it to other user space
> buffers, how do you handle this with mixed 32/64 bit user/kernel space?
...
> Perhaps I'm missing something, but I think, that would currently break in
> your syscall handler, no?

yes. compat stuff is not part of this diff.
I was thinking to handle it similar to the way signalfd deals with it:
add compat syscall, compat struct and convert it before going into
main entry point.

Alexei Starovoitov

unread,
Sep 10, 2014, 2:10:01 PM9/10/14
to
On Wed, Sep 10, 2014 at 4:35 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>
> Since we already have an extensive BPF test suite, that is, lib/test_bpf.c,
> which currently also does sanity checks for the classic BPF verifier, is
> there a reason these verifier test cases cannot be extended/integrated there
> as well but have to go to kernel/bpf/test_stub.c resp.
> samples/bpf/test_verifier.c ?
> I don't like that we put testing code into kernel/bpf/ whereas we already
> have a BPF test infrastructure in the kernel elsewhere.

yes. there is a reason. Verifier needs to be tested from user space,
since it works on fds. Process local map_fd are part of the eBPF
programs. Therefore one is testing things from kernel and
another from userspace. We definitely need both.
Currently there is no use case to call verifier from inside
the kernel. I'm not sure there will be one. Verifier's main
purpose is to check user supplied programs and provide
humans an understandable error messages of what
is 'unsafe' in particular program.
Eventually we will integrate this verifier messages with
program compilation. Like, the user would write a program
in C then invoke a wrapper of compiler and verifier, which
will point to lines in C code which are doing something
wrong like loops or out of bounds access. Currently verifier
complains about particular 'unsafe' instruction, but
humans have hard time correlating asm to C.

Alexei Starovoitov

unread,
Sep 10, 2014, 2:20:01 PM9/10/14
to
On Wed, Sep 10, 2014 at 4:24 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
> On 09/10/2014 07:10 AM, Alexei Starovoitov wrote:
>>
>> move instruction macros (like BPF_MOV64_REG or BPF_ALU32_IMM)
>> from linux/filter.h into uapi/linux/bpf.h
>> so that userspace programs can use them.
>>
>> verifier testsuite (in later patches) will be using them.
>>
>> Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
>
>
> I don't think we need this commit at all. These macros are not really
> part of the ABI and we should expose __as less as possible__, otherwise
> we won't be able to alter them anymore. Plus, based on your previous
> argumentation regarding the header file expose, we also won't be able
> to add any new macros anymore since you could run into name collisions.
>
> Anyway, I don't see a reason why a e.g. central user space eBPF library
> cannot live with, for example, a private copy of these helper macros or
> whatever it wants to define by itself.
>
> I know in classic BPF there's BPF_STMT() and BPF_JUMP() but it's much
> less complicated than eBPF and less likely to changes.

Interesting idea. Are you saying just copy paste these
226 lines into user libbpf.h for now to use them in testsuite
and examples ? Hmm.
Indeed the less lines we add to uapi the better. I'll do that.

Andy Lutomirski

unread,
Sep 10, 2014, 2:30:02 PM9/10/14
to
I find this strange. Why not just make attr be a pointer to the
relevant struct for the operation being invoked?


>
> union bpf_attr {
> struct { /* anonymous struct used by BPF_MAP_CREATE command */
> enum bpf_map_type map_type;

Does this reliably generate the same type on compat systems? C++11
has a fix for enum ABI compatibility, but this is plain C :(


> struct { /* anonymous struct used by BPF_PROG_LOAD command */
> enum bpf_prog_type prog_type;
> __u32 insn_cnt;
> const struct bpf_insn *insns;
> const char *license;
> __u32 log_level; /* verbosity level of eBPF verifier */
> __u32 log_size; /* size of user buffer */
> void *log_buf; /* user supplied buffer */
> };
> };

It might be a bit nicer to have separate in and out arguments.


>
> BPF_MAP_CREATE
> int bpf_create_map(enum bpf_map_type map_type, int key_size,
> int value_size, int max_entries)
> {
> union bpf_attr attr = {
> .map_type = map_type,
> .key_size = key_size,
> .value_size = value_size,
> .max_entries = max_entries
> };

I feel like this is asking for trouble, or at least bizarre namespace
collisions in the anonymous struct members. At least please give the
structs names. (Also, the first time I read this, I assumed that
those were union members, which would have made the code be nonsense.)

>
> BPF_MAP_DELETE_ELEM
> int bpf_delete_elem(int fd, void *key)
> {
> union bpf_attr attr = {
> .map_fd = fd,
> .key = key,
> };
>
> return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
> }
> The call deletes an element in a map fd with given key.

What does it return? (The same question goes for a bunch of the map ops.)

>
> eBPF programs
> BPF_PROG_LOAD
> This cmd is used to load eBPF program into the kernel.
>
> char bpf_log_buf[LOG_BUF_SIZE];

What happens if the size isn't LOG_BUF_SIZE?

>
> int bpf_prog_load(enum bpf_prog_type prog_type,
> const struct bpf_insn *insns, int insn_cnt,
> const char *license)
> {
> union bpf_attr attr = {
> .prog_type = prog_type,
> .insns = insns,
> .insn_cnt = insn_cnt,
> .license = license,
> .log_buf = bpf_log_buf,
> .log_size = LOG_BUF_SIZE,
> .log_level = 1,
> };
>
> return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
> }
> prog_type one of the available program types:
> enum bpf_prog_type {
> BPF_PROG_TYPE_UNSPEC,
> BPF_PROG_TYPE_SOCKET_FILTER,
> BPF_PROG_TYPE_TRACING_FILTER,
> };

Why does the type matter?
This (the .imm thing) is sufficiently weird that I think it needs to
be mentioned in the main docs, not just in an example. It's
especially odd since AFAIK essentially every other object format in
the world uses a separate relocation table instead of inline magic
opcodes like this.

>
> All other commands
> Zero.

Shouldn't delete return different values depending on whether anything
was deleted?

>
> ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that
> element with given key was not found.

Ah, here it is. Please document this with the ops.

>
> E2BIG program is too large.
>
> NOTES
> These commands may be used only by a privileged process (one having the
> CAP_SYS_ADMIN capability).

I hope this goes away :)

I can't shake the feeling that the whole syscall map API is wrong and
that, instead, there should be a more general concept of objects
provided by the eBPF runtime. Those objects could have methods that
are callable by the syscall and callable from eBPF code.

--Andy

Alexei Starovoitov

unread,
Sep 10, 2014, 4:30:02 PM9/10/14
to
On Wed, Sep 10, 2014 at 11:22 AM, Andy Lutomirski <lu...@amacapital.net> wrote:
>>
>> attr is a pointer to a union of type bpf_attr as defined below.
>>
>> size is the size of the union.
>
> I find this strange. Why not just make attr be a pointer to the
> relevant struct for the operation being invoked?

you mean change attr to be 'void *' and type cast it to particular
struct type based on cmd ? Possible, but I tried to avoid all
typecasts as Dave doesn't like them.

>> union bpf_attr {
>> struct { /* anonymous struct used by BPF_MAP_CREATE command */
>> enum bpf_map_type map_type;
>
> Does this reliably generate the same type on compat systems? C++11
> has a fix for enum ABI compatibility, but this is plain C :(

enum is int on both 32 and 64-bit. What was the concern?
anonymous struct ?
I've checked that with gcc 4.2 - 4.9 and clang. All was fine
and it's part of C standard now.

>> struct { /* anonymous struct used by BPF_PROG_LOAD command */
>> enum bpf_prog_type prog_type;
>> __u32 insn_cnt;
>> const struct bpf_insn *insns;
>> const char *license;
>> __u32 log_level; /* verbosity level of eBPF verifier */
>> __u32 log_size; /* size of user buffer */
>> void *log_buf; /* user supplied buffer */
>> };
>> };
>
> It might be a bit nicer to have separate in and out arguments.

would do you mean specifically?
const pointer is obviously 'in' argument.
'void *' is 'out'.

>> int bpf_create_map(enum bpf_map_type map_type, int key_size,
>> int value_size, int max_entries)
>> {
>> union bpf_attr attr = {
>> .map_type = map_type,
>> .key_size = key_size,
>> .value_size = value_size,
>> .max_entries = max_entries
>> };
>
> I feel like this is asking for trouble, or at least bizarre namespace
> collisions in the anonymous struct members. At least please give the
> structs names. (Also, the first time I read this, I assumed that
> those were union members, which would have made the code be nonsense.)

if inner struct types had names they would need to
have field names as well, so the syscall wrapper above
would become much more verbose and uglier.
Also naming structs may give wrong ideas to some users,
since they might think it's ok to init struct only and type cast
it to pass into syscall. When inner structs don't have names,
the user space is forced to always use 'union bpf_attr' and
initialize relevant fields.

>> char bpf_log_buf[LOG_BUF_SIZE];
>
> What happens if the size isn't LOG_BUF_SIZE?

would do you mean?
LOG_BUF_SIZE is just a user defined macro.
Can be anything.
it's passed along with pointer:
.log_buf = bpf_log_buf,
.log_size = LOG_BUF_SIZE,
.log_level = 1,

>> enum bpf_prog_type {
>> BPF_PROG_TYPE_UNSPEC,
>> BPF_PROG_TYPE_SOCKET_FILTER,
>> BPF_PROG_TYPE_TRACING_FILTER,
>> };
>
> Why does the type matter?

type is way to tell eBPF infra what this type of programs
is allowed to do. Different kernel subsystems
configure different types.
Like patch 12 configures TYPE_UNSPEC for testing.
This type allows one dummy function call and
bpf_context of two u64 fields.
tracing subsystem will configure TYPE_TRACING
to do different set of helper functions and different
body of 'bpf_context'.
PROG_TYPE and MAP_TYPE are two main ways
to configure eBPF infra for different use cases.

> This (the .imm thing) is sufficiently weird that I think it needs to
> be mentioned in the main docs, not just in an example. It's
> especially odd since AFAIK essentially every other object format in
> the world uses a separate relocation table instead of inline magic
> opcodes like this.

we discussed relocations before, right? ;)
I believe relocations are ugly. elf has no other way to deal
with it, since .text has valid cpu instructions and generic
loader has to adjust them without knowing hw encoding.
Here we have pseudo instructions that are much easier
to check/track in verifier than relocations.
As you remember in previous series I've tried relocation
style and it was ugly. Both as user interface and as extra
complexity for verifier.
Does commit log of patch 8 explain map_fd conversion
well enough or not?
If not, I'll add more info, but please read it first.

>> ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that
>> element with given key was not found.
>
> What does it return? (The same question goes for a bunch of the map ops.)
...
> Shouldn't delete return different values depending on whether anything
> was deleted?
...
> Ah, here it is. Please document this with the ops.

I believe it's a standard manpage style to document
return values at the end in 'return value' section.
I can duplicate it with ops, but is it really necessary?

>> These commands may be used only by a privileged process (one having the
>> CAP_SYS_ADMIN capability).
>
> I hope this goes away :)

hehe.
I think folks obsessed with security will say it should stay
this way for looooong time :)
My immediate goal is tracing and there this restriction is
necessary anyway.

> I can't shake the feeling that the whole syscall map API is wrong and
> that, instead, there should be a more general concept of objects
> provided by the eBPF runtime. Those objects could have methods that
> are callable by the syscall and callable from eBPF code.

'concept of objects'... sounds abstractly good :)
Something concrete you have in mind?

Theoretically I can see how we can add a 'stream' object
which user space can read and programs feed stuff to.
In other words an 'abstract' trace buffer, but imo it's
overdesign. When we need a trace buffer, we'll just
add a helper function that pushes stuff to it.
That will be the time when you see how handy pseudo
instructions are. In this patch the only '.imm thing', as you
say, is map_fd. I'm working on per-cpu local buffer via
the same pseudo stuff. Seem to work quite nicely.
Let's not get carried on with future cool stuff, basics first :)

Thanks for the feedback!

Daniel Borkmann

unread,
Sep 11, 2014, 2:40:02 AM9/11/14
to
On 09/10/2014 08:16 PM, Alexei Starovoitov wrote:
> On Wed, Sep 10, 2014 at 4:24 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>> On 09/10/2014 07:10 AM, Alexei Starovoitov wrote:
>>>
>>> move instruction macros (like BPF_MOV64_REG or BPF_ALU32_IMM)
>>> from linux/filter.h into uapi/linux/bpf.h
>>> so that userspace programs can use them.
>>>
>>> verifier testsuite (in later patches) will be using them.
>>>
>>> Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
>>
>> I don't think we need this commit at all. These macros are not really
>> part of the ABI and we should expose __as less as possible__, otherwise
>> we won't be able to alter them anymore. Plus, based on your previous
>> argumentation regarding the header file expose, we also won't be able
>> to add any new macros anymore since you could run into name collisions.
>>
>> Anyway, I don't see a reason why a e.g. central user space eBPF library
>> cannot live with, for example, a private copy of these helper macros or
>> whatever it wants to define by itself.
>>
>> I know in classic BPF there's BPF_STMT() and BPF_JUMP() but it's much
>> less complicated than eBPF and less likely to changes.
>
> Interesting idea. Are you saying just copy paste these
> 226 lines into user libbpf.h for now to use them in testsuite
> and examples ? Hmm.
> Indeed the less lines we add to uapi the better. I'll do that.

Yes, that should be just fine, the existing ABI is not allowed to
change anyway and we're running into less pain if we decide to change
existing or add new macros internally.

Daniel Borkmann

unread,
Sep 11, 2014, 3:50:01 PM9/11/14
to
On 09/10/2014 07:32 PM, Alexei Starovoitov wrote:
> On Wed, Sep 10, 2014 at 2:03 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>> struct { /* anonymous struct used by BPF_PROG_LOAD command */
>>> enum bpf_prog_type prog_type;
>>> __u32 insn_cnt;
>>> const struct bpf_insn *insns;
>>> const char *license;
>>> __u32 log_level; /* verbosity level of
>>> eBPF verifier */
>>> __u32 log_size; /* size of user buffer */
>>> void *log_buf; /* user supplied buffer
>>> */
>>
>>
>> What is log buffer? Would that mean the verifier will return an error
>> string if the program will not pass it, or if not, what other data?
>> I think the man page is missing how to examine the returned verifier
>> log buffer data.
>
> yes. it's an error log (as text string for humans) from verifier.

I was confused due to the void pointer. But that also means that the text
string becomes part of the ABI; aren't eBPF specific error codes (perhaps
a tuple of [line + error code]), though ugly as well, but perhaps the better
solution to this [which user space can then map to an actual string]?

Daniel Borkmann

unread,
Sep 11, 2014, 4:00:01 PM9/11/14
to
On 09/10/2014 10:21 PM, Alexei Starovoitov wrote:
...
>>> char bpf_log_buf[LOG_BUF_SIZE];
>>
>> What happens if the size isn't LOG_BUF_SIZE?
>
> would do you mean?
> LOG_BUF_SIZE is just a user defined macro.
> Can be anything.

I believe, Andy means, what would happen if log_level > 0 but
the log_size is not big enough so that the human-readable error
text string from the verifier will fit into it? I presume that
will be truncated then ... hm.

> it's passed along with pointer:
> .log_buf = bpf_log_buf,
> .log_size = LOG_BUF_SIZE,
> .log_level = 1,

Alexei Starovoitov

unread,
Sep 11, 2014, 4:40:02 PM9/11/14
to
On Thu, Sep 11, 2014 at 12:54 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
> On 09/10/2014 10:21 PM, Alexei Starovoitov wrote:
> ...
>>>>
>>>> char bpf_log_buf[LOG_BUF_SIZE];
>>>
>>>
>>> What happens if the size isn't LOG_BUF_SIZE?
>>
>>
>> would do you mean?
>> LOG_BUF_SIZE is just a user defined macro.
>> Can be anything.
>
> I believe, Andy means, what would happen if log_level > 0 but
> the log_size is not big enough so that the human-readable error
> text string from the verifier will fit into it? I presume that
> will be truncated then ... hm.

Correct. It will be truncated and ENOSPC returned from syscall.
Just noticed that I didn't mention that in the manpage... will do.

Alexei Starovoitov

unread,
Sep 11, 2014, 4:40:02 PM9/11/14
to
On Thu, Sep 11, 2014 at 12:47 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
> On 09/10/2014 07:32 PM, Alexei Starovoitov wrote:
>>
>> On Wed, Sep 10, 2014 at 2:03 AM, Daniel Borkmann <dbor...@redhat.com>
>> wrote:
>>>>
>>>> struct { /* anonymous struct used by BPF_PROG_LOAD command
>>>> */
>>>> enum bpf_prog_type prog_type;
>>>> __u32 insn_cnt;
>>>> const struct bpf_insn *insns;
>>>> const char *license;
>>>> __u32 log_level; /* verbosity level of
>>>> eBPF verifier */
>>>> __u32 log_size; /* size of user buffer
>>>> */
>>>> void *log_buf; /* user supplied
>>>> buffer
>>>> */
>>>
>>>
>>>
>>> What is log buffer? Would that mean the verifier will return an error
>>> string if the program will not pass it, or if not, what other data?
>>> I think the man page is missing how to examine the returned verifier
>>> log buffer data.
>>
>>
>> yes. it's an error log (as text string for humans) from verifier.
>
> I was confused due to the void pointer. But that also means that the text

ahh. ok. will change it to 'char *' then.

> string becomes part of the ABI; aren't eBPF specific error codes (perhaps
> a tuple of [line + error code]), though ugly as well, but perhaps the better
> solution to this [which user space can then map to an actual string]?

the verifier log contains full trace. Last unsafe instruction + error
in many cases is useless. What we found empirically from using
it over last 2 years is that developers have different learning curve
to adjust to 'safe' style of C. Pretty much everyone couldn't
figure out why program is rejected based on last error. Therefore
verifier emits full log. From the 1st insn all the way till the last
'unsafe' instruction. So the log is multiline output.
'Understanding eBPF verifier messages' section of
Documentation/networking/filter.txt provides few trivial
examples of these multiline messages.
Like for the program:
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),
the verifier log_buf is:
0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (b7) r1 = 0
4: (85) call 1
5: (15) if r0 == 0x0 goto pc+1
R0=map_ptr R10=fp
6: (7a) *(u64 *)(r0 +4) = 0
misaligned access off 4 size 8

It will surely change over time as verifier becomes smarter,
supports new types, optimizations and so on.
So this log is not an ABI. It's for humans to read.
The log explains _how_ verifier came to conclusion
that the program is unsafe.

Andy Lutomirski

unread,
Sep 11, 2014, 6:00:02 PM9/11/14
to
Given that you've already arranged (I think) for the verifier to be
compilable in the kernel and in userspace, would it make more sense to
have the kernel version just say yes or no and to make it easy for
user code to retry verification in userspace if they want a full
explanation?

--Andy

Alexei Starovoitov

unread,
Sep 11, 2014, 6:30:02 PM9/11/14
to
Good memory :) Long ago I had a hack where I compiled
verifier.o for kernel and linked it with userspace wrappers to
have the same verifier for userspace. It was very fragile.
and maps were not separate objects and there were no fds.
It's not feasible anymore, since different subsystems
will configure different bpf_context and helper functions and
verifier output is dynamic based on maps that were created.
For example, if user's samples/bpf/sock_example.c does
bpf_create_map(HASH, sizeof(key) * 2, ...);
instead of
bpf_create_map(HASH, sizeof(key), ...);
the same program will be rejected in first case and will be
accepted in the second, because map sizes and ebpf
program expectations are mismatching.
For the 1st case verifier will complain that program is
trying to pass uninitialized stack into bpf_lookup(key,...)
method or stack may be out of bounds.
Human insight is needed to understand what is going on.

I think more important is that source of truth needs to be
in one place == kernel. If we have two verifiers they will
diverge sooner or later and cause confusion for users.
I think as long as we document that verifier log
messages are not cast in stone and will change, we're fine.
I consider them as continuation of compiler warnings/errors.
They are meant for humans and do change from time to time.

Andy Lutomirski

unread,
Sep 11, 2014, 9:20:02 PM9/11/14
to
Hmm.

This actually furthers my thought that the relocations should be a
real relocation table. Then you could encode the types of the
referenced objects in the table, and a program could be verified
without looking up the fds. The only extra step would be to confirm
that the actual types referenced match those in the table.

--Andy

Alexei Starovoitov

unread,
Sep 11, 2014, 9:30:02 PM9/11/14
to
It's not the type is being checked, but one particular map instance
with user specified key/value sizes. type is not helpful. type is not
even used during verification. Only key_size and value_size of
elements are meaningful and they're looked up dynamically by fd.

Alexei Starovoitov

unread,
Sep 12, 2014, 6:50:01 PM9/12/14
to
Hi All,

the list of things I fixed so far from V11:
- dropped patch 11 and copied few macros to libbpf.h (suggested by Daniel)
- replaced 'enum bpf_prog_type' with u32 to be safe in compat (.. Andy)
- implemented and tested compat support (.. Daniel)
- changed 'void *log_buf' to 'char *' (.. Daniel)
- combined struct bpf_work_struct and bpf_prog_info (.. Daniel)
- added better return value explanation to manpage (.. Andy)
- added log_buf/log_size explanation to manpage (.. Andy & Daniel)
- added a lot more info about prog_type and map_type and
they relation to verifier (.. Andy)

anything else I missed from the discussion we had?

Here is updated manpage. Please take a look.

BPF(2) Linux Programmer's Manual BPF(2)



NAME
bpf - perform a command on eBPF map or program

SYNOPSIS
#include <linux/bpf.h>

int bpf(int cmd, union bpf_attr *attr, unsigned int size);


DESCRIPTION
bpf() syscall is a multiplexor for a range of different operations on
eBPF which can be characterized as "universal in-kernel virtual
machine". eBPF is similar to original Berkeley Packet Filter (or
"classic BPF") used to filter network packets. Both statically analyze
the programs before loading them into the kernel to ensure that
programs cannot harm the running system.

eBPF extends classic BPF in multiple ways including ability to call in-
kernel helper functions and access shared data structures like eBPF
maps. The programs can be written in a restricted C that is compiled
into eBPF bytecode and executed on the eBPF virtual machine or JITed
into native instruction set.

eBPF Design/Architecture
eBPF maps is a generic storage of different types. User process can
create multiple maps (with key/value being opaque bytes of data) and
access them via file descriptor. In parallel eBPF programs can access
maps from inside the kernel. It's up to user process and eBPF program
to decide what they store inside maps.

eBPF programs are similar to kernel modules. They are loaded by the
user process and automatically unloaded when process exits. Each eBPF
program is a safe run-to-completion set of instructions. eBPF verifier
statically determines that the program terminates and is safe to
execute. During verification the program takes a hold of maps that it
element

BPF_PROG_LOAD
Verify and load eBPF program

attr is a pointer to a union of type bpf_attr as defined below.

size is the size of the union.

union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type;
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};

struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
int map_fd;
void *key;
union {
void *value;
void *next_key;
};
};

struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type;
__u32 insn_cnt;
const struct bpf_insn *insns;
const char *license;
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
char *log_buf; /* user supplied buffer */
};
};

eBPF maps
maps is a generic storage of different types for sharing data between
kernel and userspace.

Any map type has the following attributes:
. type
. max number of elements
. key size in bytes
. value size in bytes

The following wrapper functions demonstrate how this syscall can be
used to access the maps. The functions use the cmd argument to invoke
different operations.

BPF_MAP_CREATE
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
bpf() syscall creates a map of map_type type and given
attributes key_size, value_size, max_entries. On success it
returns process-local file descriptor. On error, -1 is returned
and errno is set to EINVAL or EPERM or ENOMEM.

The attributes key_size and value_size will be used by verifier
during program loading to check that program is calling
bpf_map_*_elem() helper functions with correctly initialized key
and that program doesn't access map element value beyond
specified value_size. For example, when map is created with
key_size = 8 and program does:
bpf_map_lookup_elem(map_fd, fp - 4)
such program will be rejected, since in-kernel helper function
bpf_map_lookup_elem(map_fd, void *key) expects to read 8 bytes
from 'key' pointer, but 'fp - 4' starting address will cause out
of bounds stack access.

Similarly, when map is created with value_size = 1 and program
does:
value = bpf_map_lookup_elem(...);
*(u32 *)value = 1;
such program will be rejected, since it accesses value pointer
beyond specified 1 byte value_size limit.

Currently only hash table map_type is supported:
enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
BPF_MAP_TYPE_HASH,
};
map_type selects one of the available map implementations in
kernel. Regardless of type eBPF program accesses maps with the
same bpf_map_lookup_elem()/bpf_map_update_elem() helper
functions.

BPF_MAP_LOOKUP_ELEM
int bpf_lookup_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.value = value,
};

return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
bpf() syscall looks up an element with given key in a map fd.
If element is found it returns zero and stores element's value
into value. If element is not found it returns -1 and sets
errno to ENOENT.

BPF_MAP_UPDATE_ELEM
int bpf_update_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.value = value,
};

return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
The call creates or updates element with given key/value in a
map fd. On success it returns zero. On error, -1 is returned
and errno is set to EINVAL or EPERM or ENOMEM or E2BIG. E2BIG
indicates that number of elements in the map reached max_entries
limit specified at map creation time.

BPF_MAP_DELETE_ELEM
int bpf_delete_elem(int fd, void *key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
};

return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
The call deletes an element in a map fd with given key. Returns
zero on success. If element is not found it returns -1 and sets
errno to ENOENT.

BPF_MAP_GET_NEXT_KEY
int bpf_get_next_key(int fd, void *key, void *next_key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = key,
.next_key = next_key,
};

return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}
The call looks up an element by key in a given map fd and
returns key of the next element into next_key pointer. If key is
not found, it return zero and returns key of the first element
into next_key. If key is the last element, it returns -1 and
sets errno to ENOENT. Other possible errno values are ENOMEM,
EFAULT, EPERM, EINVAL. This method can be used to iterate over
all elements of the map.

close(map_fd)
will delete the map map_fd. Exiting process will delete all
maps automatically.

eBPF programs
BPF_PROG_LOAD
This cmd is used to load eBPF program into the kernel.

char bpf_log_buf[LOG_BUF_SIZE];

int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = insns,
.insn_cnt = insn_cnt,
.license = license,
.log_buf = bpf_log_buf,
.log_size = LOG_BUF_SIZE,
.log_level = 1,
};

return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
prog_type is one of the available program types:
enum bpf_prog_type {
BPF_PROG_TYPE_UNSPEC,
BPF_PROG_TYPE_SOCKET,
BPF_PROG_TYPE_TRACING,
};
By picking prog_type program author selects a set of helper
functions callable from eBPF program and corresponding format of
struct bpf_context (which is the data blob passed into the
program as the first argument). For example, the programs
loaded with prog_type = TYPE_TRACING may call bpf_printk()
helper, whereas TYPE_SOCKET programs may not. The set of
functions available to the programs under given type may
increase in the future.

Currently the set of functions for TYPE_TRACING is:
bpf_map_lookup_elem(map_fd, void *key) // lookup key in a map_fd
bpf_map_update_elem(map_fd, void *key, void *value) // update key/value
bpf_map_delete_elem(map_fd, void *key) // delete key in a map_fd
bpf_ktime_get_ns(void) // returns current ktime
bpf_printk(char *fmt, int fmt_size, ...) // prints into trace buffer
bpf_get_current(void) // return current task pointer
bpf_memcmp(void *ptr1, void *ptr2, int size) // non-faulting memcmp
bpf_fetch_ptr(void *ptr) // non-faulting load pointer from any address
bpf_fetch_u8(void *ptr) // non-faulting 1 byte load
bpf_fetch_u16(void *ptr) // other non-faulting loads
bpf_fetch_u32(void *ptr)
bpf_fetch_u64(void *ptr)

and bpf_context is defined as:
struct bpf_context {
/* argN fields match one to one to arguments passed to trace events */
u64 arg1, arg2, arg3, arg4, arg5, arg6;
/* return value from kretprobe event or from syscall_exit event */
u64 ret;
};

The set of helper functions for TYPE_SOCKET is TBD.

More program types may be added in the future. Like
BPF_PROG_TYPE_USER_TRACING for unprivileged programs.

insns array of "struct bpf_insn" instructions

insn_cnt number of instructions in the program

license license string, which must be GPL compatible to call
helper functions marked gpl_only

log_buf user supplied buffer that in-kernel verifier is using to
store verification log. Log is a multi-line string that should
be used by program author to understand how verifier came to
conclusion that program is unsafe. The format of the output can
change at any time as verifier evolves.

log_size size of user buffer. If size of the buffer is not large
enough to store all verifier messages, -1 is returned and errno
is set to ENOSPC.
int sock, map_fd, prog_fd, key;
long long value = 0, tcp_cnt, udp_cnt;

map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2);
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
/* likely not run as root */
return 1;
}

key = 6; /* ip->proto == tcp */
assert(bpf_update_elem(map_fd, &key, &value) == 0);

key = 17; /* ip->proto == udp */
assert(bpf_update_elem(map_fd, &key, &value) == 0);

struct bpf_insn prog[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
BPF_LD_ABS(BPF_B, 14 + 9), /* r0 = ip->proto */
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4),/* *(u32 *)(fp - 4) = r0 */
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = r2 - 4 */
BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), /* r0 = map_lookup(r1, r2) */
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) goto pc+2 */
BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* lock *(u64 *)r0 += r1 */
BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
BPF_EXIT_INSN(), /* return r0 */
};
prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET, prog, sizeof(prog), "GPL");
outside accessible address space.

EINVAL The value specified in cmd is not recognized by this kernel.

EINVAL For BPF_MAP_CREATE, either map_type or attributes are invalid.

EINVAL For BPF_MAP_*_ELEM commands, some of the fields of "union
bpf_attr" unused by this command are not set to zero.

EINVAL For BPF_PROG_LOAD, attempt to load invalid program (unrecognized
instruction or uses reserved fields or jumps out of range or
loop detected or calls unknown function).

EACCES For BPF_PROG_LOAD, though program has valid instructions, it was
rejected, since it was deemed unsafe (may access disallowed
memory region or uninitialized stack/register or function
constraints don't match actual types or misaligned access). In
such case it is recommended to call bpf() again with log_level =
1 and examine log_buf for specific reason provided by verifier.

ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM, indicates that
element with given key was not found.

E2BIG program is too large or a map reached max_entries limit (max
number of elements).

NOTES
These commands may be used only by a privileged process (one having the
CAP_SYS_ADMIN capability).

SEE ALSO
eBPF architecture and instruction set is explained in
Documentation/networking/filter.txt



Linux 2014-09-12 BPF(2)

Alexei Starovoitov

unread,
Sep 15, 2014, 3:20:02 PM9/15/14
to
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
struct {
...
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
char __user *log_buf; /* user supplied buffer */
};
};

when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.

'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:

BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),

will be rejected with the following multi-line message in log_buf:

0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (b7) r1 = 0
4: (85) call 1
5: (15) if r0 == 0x0 goto pc+1
R0=map_ptr R10=fp
6: (7a) *(u64 *)(r0 +4) = 0
misaligned access off 4 size 8

The format of the output can change at any time as verifier evolves.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/uapi/linux/bpf.h | 3 +
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 239 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9d56d128b8c..a6331f2ccf93 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -138,6 +138,9 @@ union bpf_attr {
__u32 insn_cnt;
const struct bpf_insn __user *insns;
const char __user *license;
+ __u32 log_level; /* verbosity level of eBPF verifier */
+ __u32 log_size; /* size of user buffer */
+ char __user *log_buf; /* user supplied buffer */
};
};

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 10a50453d101..7c062a34c696 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -452,7 +452,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
}

/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD license
+#define BPF_PROG_LOAD_LAST_FIELD log_buf

static int bpf_prog_load(union bpf_attr *attr)
{
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6f9c3d6b4d7..c721d1f4d3cc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,9 +125,244 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
int ret = -EINVAL;

+ if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ /* ret = do_check(env); */
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
return ret;
}
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:01 PM9/15/14
to
done as separate commit to ease conflict resolution

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
arch/x86/syscalls/syscall_32.tbl | 1 +
arch/x86/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 3 ++-
include/uapi/asm-generic/unistd.h | 4 +++-
kernel/sys_ni.c | 3 +++
5 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 028b78168d85..9fe1b5d002f0 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -363,3 +363,4 @@
354 i386 seccomp sys_seccomp
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
+357 i386 bpf sys_bpf
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 35dd922727b9..281150b539a2 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -327,6 +327,7 @@
318 common getrandom sys_getrandom
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0f86d85a9ce4..bda9b81357cc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -65,6 +65,7 @@ struct old_linux_dirent;
struct perf_event_attr;
struct file_handle;
struct sigaltstack;
+union bpf_attr;

#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
const char __user *uargs);
asmlinkage long sys_getrandom(char __user *buf, size_t count,
unsigned int flags);
-
+asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 11d11bc5c78f..22749c134117 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp)
__SYSCALL(__NR_getrandom, sys_getrandom)
#define __NR_memfd_create 279
__SYSCALL(__NR_memfd_create, sys_memfd_create)
+#define __NR_bpf 280
+__SYSCALL(__NR_bpf, sys_bpf)

#undef __NR_syscalls
-#define __NR_syscalls 280
+#define __NR_syscalls 281

/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..b4b5083f5f5e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -218,3 +218,6 @@ cond_syscall(sys_kcmp);

/* operate on Secure Computing state */
cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:01 PM9/15/14
to
this patch adds all of eBPF verfier documentation and empty bpf_check()

The end goal for the verifier is to statically check safety of the program.

Verifier will catch:
- loops
- out of range jumps
- unreachable instructions
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention

More details in Documentation/networking/filter.txt

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 224 +++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 2 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 133 +++++++++++++++++++++
5 files changed, 361 insertions(+), 2 deletions(-)
create mode 100644 kernel/bpf/verifier.c

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 1900d29518f1..f1c90967f748 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
32-bit immediate value into a register.

+eBPF verifier
+-------------
+The safety of the eBPF program is determined in two steps.
+
+First step does DAG check to disallow loops and other CFG validation.
+In particular it will detect programs that have unreachable instructions.
+(though classic BPF checker allows them)
+
+Second step starts from the first insn and descends all possible paths.
+It simulates execution of every insn and observes the state change of
+registers and stack.
+
+At the start of the program the register R1 contains a pointer to context
+and has type PTR_TO_CTX.
+If verifier sees an insn that does R2=R1, then R2 has now type
+PTR_TO_CTX as well and can be used on the right hand side of expression.
+If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE,
+since addition of two valid pointers makes invalid pointer.
+(In 'secure' mode verifier will reject any type of pointer arithmetic to make
+sure that kernel addresses don't leak to unprivileged users)
+
+If register was never written to, it's not readable:
+ bpf_mov R0 = R2
+ bpf_exit
+will be rejected, since R2 is unreadable at the start of the program.
+
+After kernel function call, R1-R5 are reset to unreadable and
+R0 has a return type of the function.
+
+Since R6-R9 are callee saved, their state is preserved across the call.
+ bpf_mov R6 = 1
+ bpf_call foo
+ bpf_mov R0 = R6
+ bpf_exit
+is a correct program. If there was R1 instead of R6, it would have
+been rejected.
+
+load/store instructions are allowed only with registers of valid types, which
+are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked.
+For example:
+ bpf_mov R1 = 1
+ bpf_mov R2 = 2
+ bpf_xadd *(u32 *)(R1 + 3) += R2
+ bpf_exit
+will be rejected, since R1 doesn't have a valid pointer type at the time of
+execution of instruction bpf_xadd.
+
+At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context')
+A callback is used to customize verifier to restrict eBPF program access to only
+certain fields within ctx structure with specified size and alignment.
+
+For example, the following insn:
+ bpf_ld R0 = *(u32 *)(R6 + 8)
+intends to load a word from address R6 + 8 and store it into R0
+If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know
+that offset 8 of size 4 bytes can be accessed for reading, otherwise
+the verifier will reject the program.
+If R6=FRAME_PTR, then access should be aligned and be within
+stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8,
+so it will fail verification, since it's out of bounds.
+
+The verifier will allow eBPF program to read data from stack only after
+it wrote into it.
+Classic BPF verifier does similar check with M[0-15] memory slots.
+For example:
+ bpf_ld R0 = *(u32 *)(R10 - 4)
+ bpf_exit
+is invalid program.
+Though R10 is correct read-only register and has type FRAME_PTR
+and R10 - 4 is within stack bounds, there were no stores into that location.
+
+Pointer register spill/fill is tracked as well, since four (R6-R9)
+callee saved registers may not be enough for some programs.
+
+Allowed function calls are customized with bpf_verifier_ops->get_func_proto()
+The eBPF verifier will check that registers match argument constraints.
+After the call register R0 will be set to return type of the function.
+
+Function calls is a main mechanism to extend functionality of eBPF programs.
+Socket filters may let programs to call one set of functions, whereas tracing
+filters may allow completely different set.
+
+If a function made accessible to eBPF program, it needs to be thought through
+from safety point of view. The verifier will guarantee that the function is
+called with valid arguments.
+
+seccomp vs socket filters have different security restrictions for classic BPF.
+Seccomp solves this by two stage verifier: classic BPF verifier is followed
+by seccomp verifier. In case of eBPF one configurable verifier is shared for
+all use cases.
+
+See details of eBPF verifier in kernel/bpf/verifier.c
+
eBPF maps
---------
'maps' is a generic storage of different types for sharing data between kernel
@@ -1040,6 +1133,137 @@ The map is defined by:
. key size in bytes
. value size in bytes

+Understanding eBPF verifier messages
+------------------------------------
+
+The following are few examples of invalid eBPF programs and verifier error
+messages as seen in the log:
+
+Program with unreachable instructions:
+static struct bpf_insn prog[] = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+};
+Error:
+ unreachable insn 1
+
+Program that reads uninitialized register:
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r0 = r2
+ R2 !read_ok
+
+Program that doesn't initialize R0 before exiting:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r1
+ 1: (95) exit
+ R0 !read_ok
+
+Program that accesses stack out of bounds:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 +8) = 0
+ invalid stack off=8 size=8
+
+Program that doesn't initialize stack before passing its address into function:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r10
+ 1: (07) r2 += -8
+ 2: (b7) r1 = 0x0
+ 3: (85) call 1
+ invalid indirect read from stack off -8+0 size 8
+
+Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ fd 0 is not pointing to valid bpf_map
+
+Program that doesn't check return value of map_lookup_elem() before accessing
+map element:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ 5: (7a) *(u64 *)(r0 +0) = 0
+ R0 invalid mem access 'map_value_or_null'
+
+Program that correctly checks map_lookup_elem() returned value for NULL, but
+accesses the memory with incorrect alignment:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+1
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +4) = 0
+ misaligned access off 4 size 8
+
+Program that correctly checks map_lookup_elem() returned value for NULL and
+accesses memory with correct alignment in one side of 'if' branch, but fails
+to do so in the other side of 'if' branch:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+2
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +0) = 0
+ 7: (95) exit
+
+ from 5 to 8: R0=imm0 R10=fp
+ 8: (7a) *(u64 *)(r0 +0) = 1
+ R0 invalid mem access 'imm'
+
Testing
-------

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 92979182be81..9dfeb36f8971 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -83,5 +83,7 @@ struct bpf_prog_aux {

void bpf_prog_put(struct bpf_prog *prog);
struct bpf_prog *bpf_prog_get(u32 ufd);
+/* verify correctness of eBPF program */
+int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);

#endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e9f7334ed07a..3c726b0995b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o syscall.o
+obj-y := core.o syscall.o verifier.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 158d3aca29e4..10a50453d101 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -500,7 +500,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;

/* run eBPF verifier */
- /* err = bpf_check(prog, tb); */
+ err = bpf_check(prog, attr);

if (err < 0)
goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..d6f9c3d6b4d7
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ int ret = -EINVAL;
+
+ return ret;
+}

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:01 PM9/15/14
to
Hi All,

the list of changes v11 -> v12:
- dropped patch 11 and copied few macros to libbpf.h (suggested by Daniel)
- replaced 'enum bpf_prog_type' with u32 to be safe in compat (.. Andy)
- implemented and tested compat support (not part of this set) (.. Daniel)
- changed 'void *log_buf' to 'char *' (.. Daniel)
- combined struct bpf_work_struct and bpf_prog_info (.. Daniel)
- added better return value explanation to manpage (.. Andy)
- added log_buf/log_size explanation to manpage (.. Andy & Daniel)
- added a lot more info about prog_type and map_type to manpage (.. Andy)
- rebased, tweaked test_stubs

Patches 1-4 establish BPF syscall shell for maps and programs.
Patches 5-10 add verifier step by step
Patch 11 adds test stubs for 'unspec' program type and verifier testsuite
from user space

Note that patches 1,3,4,7 add commands and attributes to the syscall
while being backwards compatible from each other, which should demonstrate
how other commands can be added in the future.

After this set the programs can be loaded for testing only. They cannot
be attached to any events. Though manpage talks about tracing and sockets,
it will be a subject of future patches.

Please take a look at manpage:
__u32 insn_cnt;
const struct bpf_insn *insns;
const char *license;
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
char *log_buf; /* user supplied buffer */
};
};

eBPF maps
maps is a generic storage of different types for sharing data between
kernel and userspace.

Any map type has the following attributes:
. type
. max number of elements
. key size in bytes
. value size in bytes

kernel. For all map_types eBPF programs access maps with the
bpf_memcmp(void *ptr1, void *ptr2, int size) // non-faulting memcmp
bpf_fetch_ptr(void *ptr) // non-faulting load pointer from any address
bpf_fetch_u8(void *ptr) // non-faulting 1 byte load
bpf_fetch_u16(void *ptr) // other non-faulting loads
bpf_fetch_u32(void *ptr)
bpf_fetch_u64(void *ptr)

and bpf_context is defined as:
struct bpf_context {
/* argN fields match one to one to arguments passed to trace events */
u64 arg1, arg2, arg3, arg4, arg5, arg6;
/* return value from kretprobe event or from syscall_exit event */
u64 ret;
};

The set of helper functions for TYPE_SOCKET is TBD.

More program types may be added in the future. Like
BPF_PROG_TYPE_USER_TRACING for unprivileged programs.

BPF_PROG_TYPE_UNSPEC is used for testing only. Such programs
cannot be attached to events.

insns array of "struct bpf_insn" instructions

insn_cnt number of instructions in the program

license license string, which must be GPL compatible to call
helper functions marked gpl_only

log_buf user supplied buffer that in-kernel verifier is using to
store verification log. Log is a multi-line string that should
be used by program author to understand how verifier came to
conclusion that program is unsafe. The format of the output can
change at any time as verifier evolves.

Linux 2014-09-15 BPF(2)

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:02 PM9/15/14
to
'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error

- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error

- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 8 ++
include/uapi/linux/bpf.h | 38 ++++++++
kernel/bpf/syscall.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 275 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48014a71f0fe..2887f3f9da59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,7 @@

#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
+#include <linux/file.h>

struct bpf_map;

@@ -17,6 +18,12 @@ struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_free)(struct bpf_map *);
+ int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+
+ /* funcs callable from userspace and from eBPF programs */
+ void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+ int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+ int (*map_delete_elem)(struct bpf_map *map, void *key);
};

struct bpf_map {
@@ -37,5 +44,6 @@ struct bpf_map_type_list {

void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
+struct bpf_map *bpf_map_get(struct fd f);

#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ecc8fa48a2a4..d4ba8bb66bd4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -70,6 +70,35 @@ enum bpf_cmd {
* map is deleted when fd is closed
*/
BPF_MAP_CREATE,
+
+ /* lookup key in a given map
+ * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero and stores found elem into value
+ * or negative error
+ */
+ BPF_MAP_LOOKUP_ELEM,
+
+ /* create or update key/value pair in a given map
+ * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero or negative error
+ */
+ BPF_MAP_UPDATE_ELEM,
+
+ /* find and delete elem by key in a given map
+ * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key
+ * returns zero or negative error
+ */
+ BPF_MAP_DELETE_ELEM,
+
+ /* lookup key in a given map and return next key
+ * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->next_key
+ * returns zero and stores next key or negative error
+ */
+ BPF_MAP_GET_NEXT_KEY,
};

enum bpf_map_type {
@@ -83,6 +112,15 @@ union bpf_attr {
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};
+
+ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+ int map_fd;
+ void __user *key;
+ union {
+ void __user *value;
+ void __user *next_key;
+ };
+ };
};

#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 80082ec5a5fc..fdf7bd79cf76 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
+#include <linux/file.h>

static LIST_HEAD(bpf_map_types);

@@ -111,6 +112,222 @@ free_map:
return err;
}

+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *uvalue = attr->value;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ESRCH;
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (!value)
+ goto err_unlock;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto err_unlock;
+
+ err = 0;
+
+err_unlock:
+ rcu_read_unlock();
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *uvalue = attr->value;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = attr->key;
+ void __user *unext_key = attr->next_key;
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr *attr;
@@ -140,6 +357,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_CREATE:
err = map_create(attr);
break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(attr);
+ break;
default:
err = -EINVAL;
break;
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:02 PM9/15/14
to
1.
the library includes a trivial set of BPF syscall wrappers:
int bpf_create_map(int key_size, int value_size, int max_entries);
int bpf_update_elem(int fd, void *key, void *value);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct sock_filter_int *insns, int insn_len,
const char *license);
bpf_prog_load() stores verifier log into global bpf_log_buf[] array

and BPF_*() macros to build instructions

2.
test stubs configure eBPF infra with 'unspec' map and program types.
These are fake types used by user space testsuite only.

3.
verifier tests valid and invalid programs and expects predefined
error log messages from kernel.
40 tests so far.

$ sudo ./test_verifier
#0 add+sub+mul OK
#1 unreachable OK
#2 unreachable2 OK
#3 out of range jump OK
#4 out of range jump2 OK
#5 test1 ld_imm64 OK
...

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/Makefile | 4 +
kernel/bpf/test_stub.c | 116 +++++++++
lib/Kconfig.debug | 3 +-
samples/bpf/Makefile | 12 +
samples/bpf/libbpf.c | 89 +++++++
samples/bpf/libbpf.h | 172 ++++++++++++++
samples/bpf/test_verifier.c | 548 +++++++++++++++++++++++++++++++++++++++++++
7 files changed, 943 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/test_stub.c
create mode 100644 samples/bpf/Makefile
create mode 100644 samples/bpf/libbpf.c
create mode 100644 samples/bpf/libbpf.h
create mode 100644 samples/bpf/test_verifier.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3c726b0995b7..45427239f375 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
obj-y := core.o syscall.o verifier.o
+
+ifdef CONFIG_TEST_BPF
+obj-y += test_stub.o
+endif
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+};
+
+static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+static struct bpf_func_proto test_funcs[] = {
+ [BPF_FUNC_unspec] = {
+ .func = test_func,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ },
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+ if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ return NULL;
+ return &test_funcs[func_id];
+}
+
+static const struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+} test_ctx_access[] = {
+ [offsetof(struct bpf_context, arg1)] = {
+ FIELD_SIZEOF(struct bpf_context, arg1),
+ BPF_READ
+ },
+ [offsetof(struct bpf_context, arg2)] = {
+ FIELD_SIZEOF(struct bpf_context, arg2),
+ BPF_READ
+ },
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+ return false;
+
+ access = &test_ctx_access[off];
+ if (access->size == size && (access->type & type))
+ return true;
+
+ return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+ .get_func_proto = test_func_proto,
+ .is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+ .ops = &test_ops,
+ .type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static struct bpf_map *test_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+
+ map = kzalloc(sizeof(*map), GFP_USER);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ return map;
+}
+
+static void test_map_free(struct bpf_map *map)
+{
+ kfree(map);
+}
+
+static struct bpf_map_ops test_map_ops = {
+ .map_alloc = test_map_alloc,
+ .map_free = test_map_free,
+};
+
+static struct bpf_map_type_list tl_map = {
+ .ops = &test_map_ops,
+ .type = BPF_MAP_TYPE_UNSPEC,
+};
+
+static int __init register_test_ops(void)
+{
+ bpf_register_map_type(&tl_map);
+ bpf_register_prog_type(&tl_prog);
+ return 0;
+}
+late_initcall(register_test_ops);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a28590083622..3ac43f34437b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1672,7 +1672,8 @@ config TEST_BPF
against the BPF interpreter or BPF JIT compiler depending on the
current setting. This is in particular useful for BPF JIT compiler
development, but also to run regression tests against changes in
- the interpreter code.
+ the interpreter code. It also enables test stubs for eBPF maps and
+ verifier used by user space verifier testsuite.

If unsure, say N.

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
new file mode 100644
index 000000000000..634391797856
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,12 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := test_verifier
+
+test_verifier-objs := test_verifier.o libbpf.o
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS += -I$(objtree)/usr/include
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
new file mode 100644
index 000000000000..cae0c734274c
--- /dev/null
+++ b/samples/bpf/libbpf.c
@@ -0,0 +1,89 @@
+/* eBPF mini library */
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/unistd.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/netlink.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include "libbpf.h"
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries)
+{
+ union bpf_attr attr = {
+ .map_type = map_type,
+ .key_size = key_size,
+ .value_size = value_size,
+ .max_entries = max_entries
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
+int bpf_update_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .value = value,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_lookup_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .value = value,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_delete_elem(int fd, void *key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_get_next_key(int fd, void *key, void *next_key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = key,
+ .next_key = next_key,
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+
+#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
+
+char bpf_log_buf[LOG_BUF_SIZE];
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ const struct bpf_insn *insns, int prog_len,
+ const char *license)
+{
+ union bpf_attr attr = {
+ .prog_type = prog_type,
+ .insns = insns,
+ .insn_cnt = prog_len / sizeof(struct bpf_insn),
+ .license = license,
+ .log_buf = bpf_log_buf,
+ .log_size = LOG_BUF_SIZE,
+ .log_level = 1,
+ };
+
+ bpf_log_buf[0] = 0;
+
+ return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
new file mode 100644
index 000000000000..8a31babeca5d
--- /dev/null
+++ b/samples/bpf/libbpf.h
@@ -0,0 +1,172 @@
+/* eBPF mini library */
+#ifndef __LIBBPF_H
+#define __LIBBPF_H
+
+struct bpf_insn;
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries);
+int bpf_update_elem(int fd, void *key, void *value);
+int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_delete_elem(int fd, void *key);
+int bpf_get_next_key(int fd, void *key, void *next_key);
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ const struct bpf_insn *insns, int insn_len,
+ const char *license);
+
+#define LOG_BUF_SIZE 8192
+extern char bpf_log_buf[LOG_BUF_SIZE];
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 })
+
+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
+#endif
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
new file mode 100644
index 000000000000..d10992e2740e
--- /dev/null
+++ b/samples/bpf/test_verifier.c
@@ -0,0 +1,548 @@
+/*
+ * Testsuite for eBPF verifier
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include "libbpf.h"
+
+#define MAX_INSNS 512
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct bpf_test {
+ const char *descr;
+ struct bpf_insn insns[MAX_INSNS];
+ int fixup[32];
+ const char *errstr;
+ enum {
+ ACCEPT,
+ REJECT
+ } result;
+};
+
+static struct bpf_test tests[] = {
+ {
+ "add+sub+mul",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2),
+ BPF_MOV64_IMM(BPF_REG_2, 3),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ },
+ {
+ "unreachable",
+ .insns = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+ },
+ {
+ "unreachable2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+ },
+ {
+ "out of range jump",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "out of range jump2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "test1 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .result = REJECT,
+ },
+ {
+ "test2 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .result = REJECT,
+ },
+ {
+ "test3 ld_imm64",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "test4 ld_imm64",
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "test5 ld_imm64",
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+ },
+ {
+ "no bpf_exit",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+ },
+ {
+ "loop (back-edge)",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "loop2 (back-edge)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "conditional loop",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "back-edge",
+ .result = REJECT,
+ },
+ {
+ "read uninitialized register",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 !read_ok",
+ .result = REJECT,
+ },
+ {
+ "read invalid register",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid",
+ .result = REJECT,
+ },
+ {
+ "program doesn't init R0 before exit",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok",
+ .result = REJECT,
+ },
+ {
+ "stack out of bounds",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid stack",
+ .result = REJECT,
+ },
+ {
+ "invalid call insn1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_CALL uses reserved",
+ .result = REJECT,
+ },
+ {
+ "invalid call insn2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_CALL uses reserved",
+ .result = REJECT,
+ },
+ {
+ "invalid function call",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid func 1234567",
+ .result = REJECT,
+ },
+ {
+ "uninitialized stack1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {2},
+ .errstr = "invalid indirect read from stack",
+ .result = REJECT,
+ },
+ {
+ "uninitialized stack2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid read from stack",
+ .result = REJECT,
+ },
+ {
+ "check valid spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+
+ /* fill it back into R2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+
+ /* should be able to access R0 = *(R2 + 8) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ },
+ {
+ "check corrupted spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+
+ /* mess up with R1 pointer on stack */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
+
+ /* fill back into R0 should fail */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "corrupted spill",
+ .result = REJECT,
+ },
+ {
+ "invalid src register in STX",
+ .insns = {
+ BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in STX",
+ .insns = {
+ BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in ST",
+ .insns = {
+ BPF_ST_MEM(BPF_B, 14, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid src register in LDX",
+ .insns = {
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R12 is invalid",
+ .result = REJECT,
+ },
+ {
+ "invalid dst register in LDX",
+ .insns = {
+ BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R11 is invalid",
+ .result = REJECT,
+ },
+ {
+ "junk insn",
+ .insns = {
+ BPF_RAW_INSN(0, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM",
+ .result = REJECT,
+ },
+ {
+ "junk insn2",
+ .insns = {
+ BPF_RAW_INSN(1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_LDX uses reserved fields",
+ .result = REJECT,
+ },
+ {
+ "junk insn3",
+ .insns = {
+ BPF_RAW_INSN(-1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_ALU opcode f0",
+ .result = REJECT,
+ },
+ {
+ "junk insn4",
+ .insns = {
+ BPF_RAW_INSN(-1, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_ALU opcode f0",
+ .result = REJECT,
+ },
+ {
+ "junk insn5",
+ .insns = {
+ BPF_RAW_INSN(0x7f, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_ALU uses reserved fields",
+ .result = REJECT,
+ },
+ {
+ "misaligned read from stack",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned access",
+ .result = REJECT,
+ },
+ {
+ "invalid map_fd for function call",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fd 0 is not pointing to valid bpf_map",
+ .result = REJECT,
+ },
+ {
+ "don't check return value before access",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "R0 invalid mem access 'map_value_or_null'",
+ .result = REJECT,
+ },
+ {
+ "access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "misaligned access",
+ .result = REJECT,
+ },
+ {
+ "sometimes access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup = {3},
+ .errstr = "R0 invalid mem access",
+ .result = REJECT,
+ },
+};
+
+static int probe_filter_length(struct bpf_insn *fp)
+{
+ int len = 0;
+
+ for (len = MAX_INSNS - 1; len > 0; --len)
+ if (fp[len].code != 0 || fp[len].imm != 0)
+ break;
+
+ return len + 1;
+}
+
+static int create_map(void)
+{
+ long long key, value = 0;
+ int map_fd;
+
+ map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ }
+
+ return map_fd;
+}
+
+static int test(void)
+{
+ int prog_fd, i;
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ struct bpf_insn *prog = tests[i].insns;
+ int prog_len = probe_filter_length(prog);
+ int *fixup = tests[i].fixup;
+ int map_fd = -1;
+
+ if (*fixup) {
+ map_fd = create_map();
+
+ do {
+ prog[*fixup].imm = map_fd;
+ fixup++;
+ } while (*fixup);
+ }
+ printf("#%d %s ", i, tests[i].descr);
+
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog,
+ prog_len * sizeof(struct bpf_insn),
+ "GPL");
+
+ if (tests[i].result == ACCEPT) {
+ if (prog_fd < 0) {
+ printf("FAIL\nfailed to load prog '%s'\n",
+ strerror(errno));
+ printf("%s", bpf_log_buf);
+ goto fail;
+ }
+ } else {
+ if (prog_fd >= 0) {
+ printf("FAIL\nunexpected success to load\n");
+ printf("%s", bpf_log_buf);
+ goto fail;
+ }
+ if (strstr(bpf_log_buf, tests[i].errstr) == 0) {
+ printf("FAIL\nunexpected error message: %s",
+ bpf_log_buf);
+ goto fail;
+ }
+ }
+
+ printf("OK\n");
+fail:
+ if (map_fd >= 0)
+ close(map_fd);
+ close(prog_fd);
+
+ }
+
+ return 0;
+}
+
+int main(void)
+{
+ return test();
+}
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:04 PM9/15/14
to
This patch adds verifier core which simulates execution of every insn and
records the state of registers and program stack. Every branch instruction seen
during simulation is pushed into state stack. When verifier reaches BPF_EXIT,
it pops the state from the stack and continues until it reaches BPF_EXIT again.
For program:
1: bpf_mov r1, xxx
2: if (r1 == 0) goto 5
3: bpf_mov r0, 1
4: goto 6
5: bpf_mov r0, 2
6: bpf_exit
The verifier will walk insns: 1, 2, 3, 4, 6
then it will pop the state recorded at insn#2 and will continue: 5, 6

This way it walks all possible paths through the program and checks all
possible values of registers. While doing so, it checks for:
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention
- instruction encoding is not using reserved fields

Kernel subsystem configures the verifier with two callbacks:

- bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
that provides information to the verifer which fields of 'ctx'
are accessible (remember 'ctx' is the first argument to eBPF program)

- const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
returns argument constraints of kernel helper functions that eBPF program
may call, so that verifier can checks that R1-R5 types match the prototype

More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 47 +++
kernel/bpf/verifier.c | 1003 ++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 1049 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dfeb36f8971..3cf91754a957 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
struct bpf_map *bpf_map_get(struct fd f);

+/* function argument constraints */
+enum bpf_arg_type {
+ ARG_ANYTHING = 0, /* any argument is ok */
+
+ /* the following constraints used to prototype
+ * bpf_map_lookup/update/delete_elem() functions
+ */
+ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */
+ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
+ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
+
+ /* the following constraints used to prototype bpf_memcmp() and other
+ * functions that access data on eBPF program stack
+ */
+ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */
+ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */
+};
+
+/* type of values returned from helper functions */
+enum bpf_return_type {
+ RET_INTEGER, /* function returns integer */
+ RET_VOID, /* function doesn't return anything */
+ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
+};
+
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
* to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
* instructions after verifying
@@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f);
struct bpf_func_proto {
u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
bool gpl_only;
+ enum bpf_return_type ret_type;
+ enum bpf_arg_type arg1_type;
+ enum bpf_arg_type arg2_type;
+ enum bpf_arg_type arg3_type;
+ enum bpf_arg_type arg4_type;
+ enum bpf_arg_type arg5_type;
+};
+
+/* bpf_context is intentionally undefined structure. Pointer to bpf_context is
+ * the first argument to eBPF programs.
+ * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
+ */
+struct bpf_context;
+
+enum bpf_access_type {
+ BPF_READ = 1,
+ BPF_WRITE = 2
};

struct bpf_verifier_ops {
/* return eBPF function prototype for verification */
const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+
+ /* return true if 'size' wide access at offset 'off' within bpf_context
+ * with 'type' (read or write) is allowed
+ */
+ bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
};

struct bpf_prog_type_list {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d13d9ed13e2a..f185c0f75814 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,6 +125,72 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+#define _(OP) ({ int ret = (OP); if (ret < 0) return ret; })
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* 1st byte of register spilled into stack */
+ STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+struct bpf_stack_slot {
+ enum bpf_stack_slot_type stype;
+ struct reg_state reg_st;
+};
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ struct bpf_stack_slot stack[MAX_BPF_STACK];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifer state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */

/* single container for all structs
@@ -132,6 +198,9 @@
*/
struct verifier_env {
struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
};
@@ -160,6 +229,45 @@ static void verbose(const char *fmt, ...)
va_end(args);
}

+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (env->cur_state.stack[i].stype == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.stack[i].reg_st.type]);
+ }
+ verbose("\n");
+}
+
static const char *const bpf_class_string[] = {
[BPF_LD] = "ld",
[BPF_LDX] = "ldx",
@@ -305,6 +413,695 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}

+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ struct bpf_stack_slot *slot;
+ int i;
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+ slot->stype = STACK_SPILL;
+ /* save register state */
+ slot->reg_st = state->regs[value_regno];
+ for (i = 1; i < 8; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_SPILL_PART;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ } else {
+
+ /* regular write of data into stack */
+ for (i = 0; i < size; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_MISC;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+
+ if (slot->stype == STACK_SPILL) {
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < 8; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_SPILL_PART) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] = slot->reg_st;
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size;
+
+ _(size = bpf_size_to_bytes(bpf_size));
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ _(check_map_access(env, regno, off, size));
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ _(check_ctx_access(env, off, size, t));
+ if (t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ _(check_stack_write(state, off, size, value_regno));
+ else
+ _(check_stack_read(state, off, size, value_regno));
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return 0;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check whether atomic_add can read the memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1));
+
+ /* check whether atomic_add can write into the same memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1));
+ return 0;
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno, (*mapp)->key_size));
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno, (*mapp)->value_size));
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ _(check_stack_boundary(env, regno - 1, reg->imm));
+ }
+
+ return 0;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ _(check_func_arg(env, BPF_REG_1, fn->arg1_type, &map));
+ _(check_func_arg(env, BPF_REG_2, fn->arg2_type, &map));
+ _(check_func_arg(env, BPF_REG_3, fn->arg3_type, &map));
+ _(check_func_arg(env, BPF_REG_4, fn->arg4_type, &map));
+ _(check_func_arg(env, BPF_REG_5, fn->arg5_type, &map));
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
/* return the map pointer stored inside BPF_LD_IMM64 instruction */
static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
{
@@ -313,6 +1110,34 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
return (struct bpf_map *) (unsigned long) imm64;
}

+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -492,6 +1317,181 @@ free_st:
return ret;
}

+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ _(check_alu_op(regs, insn));
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK));
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ _(check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg));
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ _(check_xadd(env, insn));
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg));
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1));
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_call(env, insn->imm));
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ _(check_reg_arg(regs, BPF_REG_0, SRC_OP));
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ _(check_cond_jmp_op(env, insn, &insn_idx));
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ verbose("LD_ABS is not supported yet\n");
+ return -EINVAL;
+ } else if (mode == BPF_IMM) {
+ _(check_ld_imm(env, insn));
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -645,9 +1645,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

- /* ret = do_check(env); */
+ ret = do_check(env);

skip_full_check:
+ while (pop_stack(env, NULL) >= 0);

if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:04 PM9/15/14
to
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
* and returns map_fd on success.
* use close(map_fd) to delete the map
*/
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

syscall is using 'union bpf_attr' to be backwards compatible with future
extensions. Different syscall commands will use different attributes.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 39 +++++++++
include/linux/bpf.h | 41 ++++++++++
include/uapi/linux/bpf.h | 23 ++++++
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 151 +++++++++++++++++++++++++++++++++++
5 files changed, 255 insertions(+), 1 deletion(-)
create mode 100644 include/linux/bpf.h
create mode 100644 kernel/bpf/syscall.c

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 81916ab5d96f..1900d29518f1 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
32-bit immediate value into a register.

+eBPF maps
+---------
+'maps' is a generic storage of different types for sharing data between kernel
+and userspace.
+
+The maps are accessed from user space via BPF syscall, which has commands:
+- create a map with given type and attributes
+ map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
+ using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
+ returns process-local file descriptor or negative error
+
+- lookup key in a given map
+ err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key, attr->value
+ returns zero and stores found elem into value or negative error
+
+- create or update key/value pair in a given map
+ err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key, attr->value
+ returns zero or negative error
+
+- find and delete element by key in a given map
+ err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+ using attr->map_fd, attr->key
+
+- to delete map: close(fd)
+ Exiting process will delete maps automatically
+
+userspace programs use this syscall to create/access maps that eBPF programs
+are concurrently updating.
+
+maps can have different types: hash, array, bloom filter, radix-tree, etc.
+
+The map is defined by:
+ . type
+ . max number of elements
+ . key size in bytes
+ . value size in bytes
+
Testing
-------

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
new file mode 100644
index 000000000000..48014a71f0fe
--- /dev/null
+++ b/include/linux/bpf.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_BPF_H
+#define _LINUX_BPF_H 1
+
+#include <uapi/linux/bpf.h>
+#include <linux/workqueue.h>
+
+struct bpf_map;
+
+/* map is generic key/value storage optionally accesible by eBPF programs */
+struct bpf_map_ops {
+ /* funcs callable from userspace (via syscall) */
+ struct bpf_map *(*map_alloc)(union bpf_attr *attr);
+ void (*map_free)(struct bpf_map *);
+};
+
+struct bpf_map {
+ atomic_t refcnt;
+ enum bpf_map_type map_type;
+ u32 key_size;
+ u32 value_size;
+ u32 max_entries;
+ struct bpf_map_ops *ops;
+ struct work_struct work;
+};
+
+struct bpf_map_type_list {
+ struct list_head list_node;
+ struct bpf_map_ops *ops;
+ enum bpf_map_type type;
+};
+
+void bpf_register_map_type(struct bpf_map_type_list *tl);
+void bpf_map_put(struct bpf_map *map);
+
+#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 479ed0b6be16..ecc8fa48a2a4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -62,4 +62,27 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};

+/* BPF syscall commands */
+enum bpf_cmd {
+ /* create a map with given type and attributes
+ * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
+ * returns fd or negative error
+ * map is deleted when fd is closed
+ */
+ BPF_MAP_CREATE,
+};
+
+enum bpf_map_type {
+ BPF_MAP_TYPE_UNSPEC,
+};
+
+union bpf_attr {
+ struct { /* anonymous struct used by BPF_MAP_CREATE command */
+ __u32 map_type; /* one of enum bpf_map_type */
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* max number of entries in a map */
+ };
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..e9f7334ed07a 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o
+obj-y := core.o syscall.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..80082ec5a5fc
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr *attr;
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted when security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* newer userspace cannot run with older kernel */
+ if (size > sizeof(*attr))
+ return -EINVAL;
+
+ attr = kzalloc(sizeof(*attr), GFP_USER);
+ if (!attr)
+ return -ENOMEM;
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ err = -EFAULT;
+ if (copy_from_user(attr, uattr, size) != 0)
+ goto free_attr;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+free_attr:
+ kfree(attr);
+ return err;
+}
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:03 PM9/15/14
to
check that control flow graph of eBPF program is a directed acyclic graph

check_cfg() does:
- detect loops
- detect unreachable instructions
- check that program terminates with BPF_EXIT insn
- check that all branches are within program boundary

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/verifier.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 183 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04d327a9a0d6..d13d9ed13e2a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -313,6 +313,185 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
return (struct bpf_map *) (unsigned long) imm64;
}

+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define PUSH_INT(I) \
+ do { \
+ if (cur_stack >= insn_cnt) { \
+ ret = -E2BIG; \
+ goto free_st; \
+ } \
+ stack[cur_stack++] = I; \
+ } while (0)
+
+#define PEEK_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[cur_stack - 1]; \
+ _ret; \
+ })
+
+#define POP_INT() \
+ ({ \
+ int _ret; \
+ if (cur_stack == 0) \
+ _ret = -1; \
+ else \
+ _ret = stack[--cur_stack]; \
+ _ret; \
+ })
+
+#define PUSH_INSN(T, W, E) \
+ do { \
+ int w = W; \
+ if (E == FALLTHROUGH && st[T] >= (DISCOVERED | FALLTHROUGH)) \
+ break; \
+ if (E == BRANCH && st[T] >= (DISCOVERED | BRANCH)) \
+ break; \
+ if (w < 0 || w >= insn_cnt) { \
+ verbose("jump out of range from insn %d to %d\n", T, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } \
+ if (st[w] == 0) { \
+ /* tree-edge */ \
+ st[T] = DISCOVERED | E; \
+ st[w] = DISCOVERED; \
+ PUSH_INT(w); \
+ goto peek_stack; \
+ } else if ((st[w] & 0xF0) == DISCOVERED) { \
+ verbose("back-edge from insn %d to %d\n", T, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } else if (st[w] == EXPLORED) { \
+ /* forward- or cross-edge */ \
+ st[T] = DISCOVERED | E; \
+ } else { \
+ verbose("insn state internal bug\n"); \
+ ret = -EFAULT; \
+ goto free_st; \
+ } \
+ } while (0)
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int cur_stack = 0;
+ int *stack;
+ int ret = 0;
+ int *st;
+ int i, t;
+
+ st = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ stack = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!stack) {
+ kfree(st);
+ return -ENOMEM;
+ }
+
+ st[0] = DISCOVERED; /* mark 1st insn as discovered */
+ PUSH_INT(0);
+
+peek_stack:
+ while ((t = PEEK_INT()) != -1) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto free_st;
+ }
+ /* unconditional jump with single edge */
+ PUSH_INSN(t, t + insns[t].off + 1, FALLTHROUGH);
+ } else {
+ /* conditional jump with two edges */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ PUSH_INSN(t, t + insns[t].off + 1, BRANCH);
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ }
+
+mark_explored:
+ st[t] = EXPLORED;
+ if (POP_INT() == -1) {
+ verbose("pop_int internal bug\n");
+ ret = -EFAULT;
+ goto free_st;
+ }
+ }
+
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (st[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto free_st;
+ }
+ }
+
+free_st:
+ kfree(st);
+ kfree(stack);
+ return ret;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -462,6 +641,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

skip_full_check:
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:02 PM9/15/14
to
eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions
to refer to process-local map_fd. Scan the program for such instructions and
if FDs are valid, convert them to 'struct bpf_map' pointers which will be used
by verifier to check access to maps in bpf_map_lookup/update() calls.
If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping
BPF_PSEUDO_MAP_FD flag.

Note that eBPF interpreter is generic and knows nothing about pseudo insns.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/filter.h | 6 ++
kernel/bpf/verifier.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 153 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ffc0958d85e..ca95abd2bed1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -145,6 +145,12 @@ struct bpf_prog_aux;
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })

+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c721d1f4d3cc..04d327a9a0d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,10 +125,15 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
/* single container for all structs
* one verifier_env per bpf_check() call
*/
struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
};

/* verbose verifier prints what it's seeing
@@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}

+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
@@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;

+ env->prog = prog;
+
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);

@@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
log_level = 0;
}

+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

+skip_full_check:
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
goto free_log_buf;
}

+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }

free_log_buf:
if (log_level)
vfree(log_buf);
free_env:
+ if (!prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
kfree(env);
mutex_unlock(&bpf_verifier_lock);
return ret;
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:04 PM9/15/14
to
eBPF programs are similar to kernel modules. They are loaded by the user
process and automatically unloaded when process exits. Each eBPF program is
a safe run-to-completion set of instructions. eBPF verifier statically
determines that the program terminates and is safe to execute.

The following syscall wrapper can be used to load the program:
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = insns,
.insn_cnt = insn_cnt,
.license = license,
};

return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
where 'insns' is an array of eBPF instructions and 'license' is a string
that must be GPL compatible to call helper functions marked gpl_only

Upon succesful load the syscall returns prog_fd.
Use close(prog_fd) to unload the program.

User space tests and examples follow in the later patches

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 38 +++++++++++
include/linux/filter.h | 8 +--
include/uapi/linux/bpf.h | 26 ++++++++
kernel/bpf/core.c | 29 ++++----
kernel/bpf/syscall.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 245 insertions(+), 20 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2887f3f9da59..92979182be81 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
struct bpf_map *bpf_map_get(struct fd f);

+/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
+ * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
+ * instructions after verifying
+ */
+struct bpf_func_proto {
+ u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+ bool gpl_only;
+};
+
+struct bpf_verifier_ops {
+ /* return eBPF function prototype for verification */
+ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+};
+
+struct bpf_prog_type_list {
+ struct list_head list_node;
+ struct bpf_verifier_ops *ops;
+ enum bpf_prog_type type;
+};
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl);
+
+struct bpf_prog;
+
+struct bpf_prog_aux {
+ atomic_t refcnt;
+ bool is_gpl_compatible;
+ enum bpf_prog_type prog_type;
+ struct bpf_verifier_ops *ops;
+ struct bpf_map **used_maps;
+ u32 used_map_cnt;
+ struct bpf_prog *prog;
+ struct work_struct work;
+};
+
+void bpf_prog_put(struct bpf_prog *prog);
+struct bpf_prog *bpf_prog_get(u32 ufd);
+
#endif /* _LINUX_BPF_H */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1a0bc6d134d7..4ffc0958d85e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -21,6 +21,7 @@
struct sk_buff;
struct sock;
struct seccomp_data;
+struct bpf_prog_aux;

/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -300,17 +301,12 @@ struct bpf_binary_header {
u8 image[];
};

-struct bpf_work_struct {
- struct bpf_prog *prog;
- struct work_struct work;
-};
-
struct bpf_prog {
u16 pages; /* Number of allocated pages */
bool jited; /* Is our filter JIT'ed? */
u32 len; /* Number of filter blocks */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
- struct bpf_work_struct *work; /* Deferred free work struct */
+ struct bpf_prog_aux *aux; /* Auxiliary fields */
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct bpf_insn *filter);
/* Instructions for interpreter */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d4ba8bb66bd4..a9d56d128b8c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -99,12 +99,23 @@ enum bpf_cmd {
* returns zero and stores next key or negative error
*/
BPF_MAP_GET_NEXT_KEY,
+
+ /* verify and load eBPF program
+ * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
+ * Using attr->prog_type, attr->insns, attr->license
+ * returns fd or negative error
+ */
+ BPF_PROG_LOAD,
};

enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
};

+enum bpf_prog_type {
+ BPF_PROG_TYPE_UNSPEC,
+};
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -121,6 +132,21 @@ union bpf_attr {
void __user *next_key;
};
};
+
+ struct { /* anonymous struct used by BPF_PROG_LOAD command */
+ __u32 prog_type; /* one of enum bpf_prog_type */
+ __u32 insn_cnt;
+ const struct bpf_insn __user *insns;
+ const char __user *license;
+ };
+};
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+enum bpf_func_id {
+ BPF_FUNC_unspec,
+ __BPF_FUNC_MAX_ID,
};

#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8b7002488251..f0c30c59b317 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -27,6 +27,7 @@
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <asm/unaligned.h>
+#include <linux/bpf.h>

/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
- struct bpf_work_struct *ws;
+ struct bpf_prog_aux *aux;
struct bpf_prog *fp;

size = round_up(size, PAGE_SIZE);
@@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;

- ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags);
- if (ws == NULL) {
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
vfree(fp);
return NULL;
}

fp->pages = size / PAGE_SIZE;
- fp->work = ws;
+ fp->aux = aux;

return fp;
}
@@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = size / PAGE_SIZE;

- /* We keep fp->work from fp_old around in the new
+ /* We keep fp->aux from fp_old around in the new
* reallocated structure.
*/
- fp_old->work = NULL;
+ fp_old->aux = NULL;
__bpf_prog_free(fp_old);
}

@@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc);

void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->work);
+ kfree(fp->aux);
vfree(fp);
}
EXPORT_SYMBOL_GPL(__bpf_prog_free);
@@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);

static void bpf_prog_free_deferred(struct work_struct *work)
{
- struct bpf_work_struct *ws;
+ struct bpf_prog_aux *aux;

- ws = container_of(work, struct bpf_work_struct, work);
- bpf_jit_free(ws->prog);
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
}

/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
- struct bpf_work_struct *ws = fp->work;
+ struct bpf_prog_aux *aux = fp->aux;

- INIT_WORK(&ws->work, bpf_prog_free_deferred);
- ws->prog = fp;
- schedule_work(&ws->work);
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ aux->prog = fp;
+ schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fdf7bd79cf76..c3aa3264e235 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -14,6 +14,8 @@
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>

static LIST_HEAD(bpf_map_types);

@@ -328,6 +330,165 @@ err_put:
return err;
}

+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->aux->prog_type = type;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ free_used_maps(prog->aux);
+ bpf_prog_free(prog);
+ }
+}
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->aux->refcnt);
+ fdput(f);
+ return prog;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD license
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, attr->license, sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, attr->insns,
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->aux->is_gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ /* err = bpf_check(prog, tb); */
+
+ if (err < 0)
+ goto free_used_maps;
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr *attr;
@@ -369,6 +530,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(attr);
break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(attr);
+ break;
default:
err = -EINVAL;
break;
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 15, 2014, 3:30:04 PM9/15/14
to
in native eBPF programs userspace is using pseudo BPF_CALL instructions
which encode one of 'enum bpf_func_id' inside insn->imm field.
Verifier checks that program using correct function arguments to given func_id.
If all checks passed, kernel needs to fixup BPF_CALL->imm fields by
replacing func_id with in-kernel function pointer.
eBPF interpreter just calls the function.

In-kernel eBPF users continue to use generic BPF_CALL.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c3aa3264e235..158d3aca29e4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -351,6 +351,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
list_add(&tl->list_node, &bpf_prog_types);
}

+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when program has bpf_call instructions
+ * and it passed bpf_check(), means that
+ * ops->get_func_proto must have been supplied, check it
+ */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
@@ -471,6 +505,9 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;

+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
/* eBPF program is ready to be JITed */
bpf_prog_select_runtime(prog);

--
1.7.9.5

David Miller

unread,
Sep 16, 2014, 3:20:02 PM9/16/14
to
From: Alexei Starovoitov <a...@plumgrid.com>
Date: Mon, 15 Sep 2014 12:18:33 -0700

> @@ -83,6 +112,15 @@ union bpf_attr {
> __u32 value_size; /* size of value in bytes */
> __u32 max_entries; /* max number of entries in a map */
> };
> +
> + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
> + int map_fd;
> + void __user *key;
> + union {
> + void __user *value;
> + void __user *next_key;
> + };
> + };
> };
>
> #endif /* _UAPI__LINUX_BPF_H__ */

Depending upon the processor ABI, this change can increase the
alignment requirements of union bpf_attr. So the structure is not
compatible between patch #1 and patch #3 here.

Also, you haven't implemented any compat layer whatsoever for the
necessary translations. This happens because you are using pointers
which are different sized between 32-bit and 64-bit ABIs.

I would suggest you use instead something like "aligned_u64" since
these are just arbitrary userland cookies and using "aligned_u64"
vs. "u64" will make it so that you don't have to deal with the 64-bit
type alignment differences between x86-32 and x86-64 while writing the
compat wrappers (if any).

David Miller

unread,
Sep 16, 2014, 3:20:02 PM9/16/14
to
From: Alexei Starovoitov <a...@plumgrid.com>
Date: Mon, 15 Sep 2014 12:18:34 -0700

> @@ -121,6 +132,21 @@ union bpf_attr {
> void __user *next_key;
> };
> };
> +
> + struct { /* anonymous struct used by BPF_PROG_LOAD command */
> + __u32 prog_type; /* one of enum bpf_prog_type */
> + __u32 insn_cnt;
> + const struct bpf_insn __user *insns;
> + const char __user *license;
> + };
> +};

Again, these need syscall compat handling. You may want to redesign
the types so that perhaps no compat layer translations will be needed.

Alexei Starovoitov

unread,
Sep 16, 2014, 4:50:02 PM9/16/14
to
On Tue, Sep 16, 2014 at 12:16 PM, David Miller <da...@davemloft.net> wrote:
> From: Alexei Starovoitov <a...@plumgrid.com>
> Date: Mon, 15 Sep 2014 12:18:33 -0700
>
>> @@ -83,6 +112,15 @@ union bpf_attr {
>> __u32 value_size; /* size of value in bytes */
>> __u32 max_entries; /* max number of entries in a map */
>> };
>> +
>> + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
>> + int map_fd;
>> + void __user *key;
>> + union {
>> + void __user *value;
>> + void __user *next_key;
>> + };
>> + };
>> };
>>
>> #endif /* _UAPI__LINUX_BPF_H__ */
>
> Depending upon the processor ABI, this change can increase the
> alignment requirements of union bpf_attr. So the structure is not
> compatible between patch #1 and patch #3 here.

the union indeed changes alignment from patch #1 to #3,
but, imo, it is not a problem, since kernel does:
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(attr, uattr, size) != 0)
and then proceeds with further bpf_attr validation, so even
if user space alignment is 4 and user assumes contents
from patch 1, but kernel alignment is 8 and contents from
patch 3, it is still ok. Backwards compatibility is preserved.

> Also, you haven't implemented any compat layer whatsoever for the
> necessary translations. This happens because you are using pointers
> which are different sized between 32-bit and 64-bit ABIs.

I mentioned it in the cover letter:
- implemented and tested compat support (not part of this set)

I'll roll compat layer into the set to make it less confusing.

> I would suggest you use instead something like "aligned_u64" since
> these are just arbitrary userland cookies and using "aligned_u64"
> vs. "u64" will make it so that you don't have to deal with the 64-bit
> type alignment differences between x86-32 and x86-64 while writing the
> compat wrappers (if any).

I haven't thought of 'aligned_u64' for this case.
For counters and masks it would be perfect, but here user is
passing real pointers to key and value, so they have to
be 'void __user *', otherwise user would need to type cast
them, which I want to avoid.

>> + struct { /* anonymous struct used by BPF_PROG_LOAD command */
>> + __u32 prog_type; /* one of enum bpf_prog_type */
>> + __u32 insn_cnt;
>> + const struct bpf_insn __user *insns;
>> + const char __user *license;
>> + };
>> +};
>
> Again, these need syscall compat handling. You may want to redesign
> the types so that perhaps no compat layer translations will be needed.

'foo __user *' pointers vs 'aligned_u64'... It's a trade off.
I can make all pointer fields to be u64 and then, indeed,
no compat layer will be needed, but user would need to
do ugly type casts. I think compat layer is a better option.
It's simple enough. I'll roll it into the set and respin.

Thanks!

David Miller

unread,
Sep 16, 2014, 5:00:04 PM9/16/14
to
From: Alexei Starovoitov <a...@plumgrid.com>
Date: Tue, 16 Sep 2014 13:44:12 -0700

> the union indeed changes alignment from patch #1 to #3,
> but, imo, it is not a problem, since kernel does:

It changes the alignment of the datastructures in userspace.

> I haven't thought of 'aligned_u64' for this case.
> For counters and masks it would be perfect, but here user is
> passing real pointers to key and value, so they have to
> be 'void __user *', otherwise user would need to type cast
> them, which I want to avoid.

The cost of the compat layer must be considered and weighted
against this cast, which I think is really no big deal.

> I think compat layer is a better option.

It's overhead you'll have to support forever, I think you should
reconsider.

All of the "ugly casting" will be hidden, or can be hidden, in the
syscall wrappers and/or interfaces in userspace.

Alexei Starovoitov

unread,
Sep 16, 2014, 5:30:02 PM9/16/14
to
On Tue, Sep 16, 2014 at 1:54 PM, David Miller <da...@davemloft.net> wrote:
>
> The cost of the compat layer must be considered and weighted
> against this cast, which I think is really no big deal.
>
>> I think compat layer is a better option.
>
> It's overhead you'll have to support forever, I think you should
> reconsider.
>
> All of the "ugly casting" will be hidden, or can be hidden, in the
> syscall wrappers and/or interfaces in userspace.

ahh, ok. I thought you're strongly against any type of casts.
In such case I can get rid of 'union bpf_attr' as well and simply
define a struct per command, then syscall will look like:
sys_bpf(int cmd, void __user *attr, unsigned int size);
and uapi/linux/bpf.h will have:
struct bpf_prog_load_attr { /* for BPF_PROG_LOAD cmd */
__u32 prog_type;
__u32 insn_cnt;
__aligned_u64 insns;
__aligned_u64 license;
__u32 log_level;
__u32 log_size;
__aligned_u64 log_buf;
};
and similar for other commands.
no compat layer and type checking will be done
by syscall wrappers. Ok?

David Miller

unread,
Sep 16, 2014, 5:50:03 PM9/16/14
to
From: Alexei Starovoitov <a...@plumgrid.com>
Date: Tue, 16 Sep 2014 14:23:27 -0700

> no compat layer and type checking will be done
> by syscall wrappers. Ok?

Why are you against using strong typing just for everything other
than the user pointer blobs?

I don't understand the resistence to my suggestion to just use
aligned_u64 instead of "void __user *" in the union members.

Alexei Starovoitov

unread,
Sep 16, 2014, 6:10:04 PM9/16/14
to
On Tue, Sep 16, 2014 at 2:49 PM, David Miller <da...@davemloft.net> wrote:
> From: Alexei Starovoitov <a...@plumgrid.com>
> Date: Tue, 16 Sep 2014 14:23:27 -0700
>
>> no compat layer and type checking will be done
>> by syscall wrappers. Ok?
>
> Why are you against using strong typing just for everything other
> than the user pointer blobs?
>
> I don't understand the resistence to my suggestion to just use
> aligned_u64 instead of "void __user *" in the union members.

All these different variants are ok to me. There are pros and
cons to either approach. I'm not against strong typing.
I just thought it would be cleaner not to use 'union' and
was asking for opinion. That's all. Sure, I will keep 'union'
and only change pointers to __aligned_u64.

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:01 PM9/16/14
to
eBPF programs are similar to kernel modules. They are loaded by the user
process and automatically unloaded when process exits. Each eBPF program is
a safe run-to-completion set of instructions. eBPF verifier statically
determines that the program terminates and is safe to execute.

The following syscall wrapper can be used to load the program:
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = ptr_to_u64(insns),
.insn_cnt = insn_cnt,
.license = ptr_to_u64(license),
};

return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
where 'insns' is an array of eBPF instructions and 'license' is a string
that must be GPL compatible to call helper functions marked gpl_only

Upon succesful load the syscall returns prog_fd.
Use close(prog_fd) to unload the program.

User space tests and examples follow in the later patches

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 38 +++++++++++
include/linux/filter.h | 8 +--
include/uapi/linux/bpf.h | 26 ++++++++
kernel/bpf/core.c | 29 ++++----
kernel/bpf/syscall.c | 165 ++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 246 insertions(+), 20 deletions(-)
index 395cabd2ca0a..424f442016e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -99,12 +99,23 @@ enum bpf_cmd {
* returns zero and stores next key or negative error
*/
BPF_MAP_GET_NEXT_KEY,
+
+ /* verify and load eBPF program
+ * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
+ * Using attr->prog_type, attr->insns, attr->license
+ * returns fd or negative error
+ */
+ BPF_PROG_LOAD,
};

enum bpf_map_type {
BPF_MAP_TYPE_UNSPEC,
};

+enum bpf_prog_type {
+ BPF_PROG_TYPE_UNSPEC,
+};
+
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */
@@ -121,6 +132,21 @@ union bpf_attr {
__aligned_u64 next_key;
};
};
+
+ struct { /* anonymous struct used by BPF_PROG_LOAD command */
+ __u32 prog_type; /* one of enum bpf_prog_type */
+ __u32 insn_cnt;
+ __aligned_u64 insns;
+ __aligned_u64 license;
+ };
} __attribute__((aligned(8)));

+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+enum bpf_func_id {
+ BPF_FUNC_unspec,
+ __BPF_FUNC_MAX_ID,
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d2d6491c21b5..a5b5c7ca0264 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -14,6 +14,8 @@
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>

static LIST_HEAD(bpf_map_types);

@@ -334,6 +336,166 @@ err_put:
+ int i;
+
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->aux->is_gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ /* err = bpf_check(prog, tb); */
+
+ if (err < 0)
+ goto free_used_maps;
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -370,6 +532,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
default:
err = -EINVAL;
break;
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:01 PM9/16/14
to
BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
* and returns map_fd on success.
* use close(map_fd) to delete the map
*/
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

syscall is using 'union bpf_attr' to be backwards compatible with future
extensions. Different syscall commands will use different attributes.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 39 ++++++++++
include/linux/bpf.h | 41 ++++++++++
include/uapi/linux/bpf.h | 23 ++++++
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 144 +++++++++++++++++++++++++++++++++++
5 files changed, 248 insertions(+), 1 deletion(-)
+ struct work_struct work;
+};
+
+struct bpf_map_type_list {
+ struct list_head list_node;
+ struct bpf_map_ops *ops;
+ enum bpf_map_type type;
+};
+
+void bpf_register_map_type(struct bpf_map_type_list *tl);
+void bpf_map_put(struct bpf_map *map);
+
+#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 479ed0b6be16..f58a10f9670c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -62,4 +62,27 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};

+/* BPF syscall commands */
+enum bpf_cmd {
+ /* create a map with given type and attributes
+ * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
+ * returns fd or negative error
+ * map is deleted when fd is closed
+ */
+ BPF_MAP_CREATE,
+};
+
+enum bpf_map_type {
+ BPF_MAP_TYPE_UNSPEC,
+};
+
+union bpf_attr {
+ struct { /* anonymous struct used by BPF_MAP_CREATE command */
+ __u32 map_type; /* one of enum bpf_map_type */
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* max number of entries in a map */
+ };
+} __attribute__((aligned(8)));
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..e9f7334ed07a 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o
+obj-y := core.o syscall.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..328a45a6d038
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,144 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted when security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* newer userspace cannot run with older kernel */
+ if (size > sizeof(attr))
+ return -EINVAL;
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:01 PM9/16/14
to
'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
returns fd or negative error

- lookup key in a given map referenced by fd
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error

- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 8 ++
include/uapi/linux/bpf.h | 38 ++++++++
kernel/bpf/syscall.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 281 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48014a71f0fe..2887f3f9da59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,7 @@

#include <uapi/linux/bpf.h>
#include <linux/workqueue.h>
+#include <linux/file.h>

struct bpf_map;

@@ -17,6 +18,12 @@ struct bpf_map_ops {
/* funcs callable from userspace (via syscall) */
struct bpf_map *(*map_alloc)(union bpf_attr *attr);
void (*map_free)(struct bpf_map *);
+ int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+
+ /* funcs callable from userspace and from eBPF programs */
+ void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+ int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+ int (*map_delete_elem)(struct bpf_map *map, void *key);
};

struct bpf_map {
@@ -37,5 +44,6 @@ struct bpf_map_type_list {

void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
+struct bpf_map *bpf_map_get(struct fd f);

#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f58a10f9670c..395cabd2ca0a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -70,6 +70,35 @@ enum bpf_cmd {
* map is deleted when fd is closed
*/
BPF_MAP_CREATE,
+
+ /* lookup key in a given map
+ * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero and stores found elem into value
+ * or negative error
+ */
+ BPF_MAP_LOOKUP_ELEM,
+
+ /* create or update key/value pair in a given map
+ * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->value
+ * returns zero or negative error
+ */
+ BPF_MAP_UPDATE_ELEM,
+
+ /* find and delete elem by key in a given map
+ * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key
+ * returns zero or negative error
+ */
+ BPF_MAP_DELETE_ELEM,
+
+ /* lookup key in a given map and return next key
+ * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+ * Using attr->map_fd, attr->key, attr->next_key
+ * returns zero and stores next key or negative error
+ */
+ BPF_MAP_GET_NEXT_KEY,
};

enum bpf_map_type {
@@ -83,6 +112,15 @@ union bpf_attr {
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};
+
+ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+ __u32 map_fd;
+ __aligned_u64 key;
+ union {
+ __aligned_u64 value;
+ __aligned_u64 next_key;
+ };
+ };
} __attribute__((aligned(8)));

#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 328a45a6d038..d2d6491c21b5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
+#include <linux/file.h>

static LIST_HEAD(bpf_map_types);

@@ -111,6 +112,228 @@ free_map:
return err;
}

+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ESRCH;
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (!value)
+ goto err_unlock;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto err_unlock;
+
+ err = 0;
+
+err_unlock:
+ rcu_read_unlock();
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -135,6 +358,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_CREATE:
err = map_create(&attr);
break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
default:
err = -EINVAL;
break;

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
in native eBPF programs userspace is using pseudo BPF_CALL instructions
which encode one of 'enum bpf_func_id' inside insn->imm field.
Verifier checks that program using correct function arguments to given func_id.
If all checks passed, kernel needs to fixup BPF_CALL->imm fields by
replacing func_id with in-kernel function pointer.
eBPF interpreter just calls the function.

In-kernel eBPF users continue to use generic BPF_CALL.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
1 file changed, 37 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a5b5c7ca0264..5a45cdc511e1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -357,6 +357,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
list_add(&tl->list_node, &bpf_prog_types);
}

+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when program has bpf_call instructions
+ * and it passed bpf_check(), means that
+ * ops->get_func_proto must have been supplied, check it
+ */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
@@ -478,6 +512,9 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;

+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
/* eBPF program is ready to be JITed */
bpf_prog_select_runtime(prog);

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
1.
the library includes a trivial set of BPF syscall wrappers:
int bpf_create_map(int key_size, int value_size, int max_entries);
int bpf_update_elem(int fd, void *key, void *value);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
int bpf_prog_load(enum bpf_prog_type prog_type,
const struct sock_filter_int *insns, int insn_len,
const char *license);
bpf_prog_load() stores verifier log into global bpf_log_buf[] array

and BPF_*() macros to build instructions

2.
test stubs configure eBPF infra with 'unspec' map and program types.
These are fake types used by user space testsuite only.

3.
verifier tests valid and invalid programs and expects predefined
error log messages from kernel.
40 tests so far.

$ sudo ./test_verifier
#0 add+sub+mul OK
#1 unreachable OK
#2 unreachable2 OK
#3 out of range jump OK
#4 out of range jump2 OK
#5 test1 ld_imm64 OK
...

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/Makefile | 4 +
kernel/bpf/test_stub.c | 116 +++++++++
lib/Kconfig.debug | 3 +-
samples/bpf/Makefile | 12 +
samples/bpf/libbpf.c | 94 ++++++++
samples/bpf/libbpf.h | 172 ++++++++++++++
samples/bpf/test_verifier.c | 548 +++++++++++++++++++++++++++++++++++++++++++
7 files changed, 948 insertions(+), 1 deletion(-)
create mode 100644 kernel/bpf/test_stub.c
create mode 100644 samples/bpf/Makefile
create mode 100644 samples/bpf/libbpf.c
create mode 100644 samples/bpf/libbpf.h
create mode 100644 samples/bpf/test_verifier.c

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3c726b0995b7..45427239f375 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
obj-y := core.o syscall.o verifier.o
+
+ifdef CONFIG_TEST_BPF
+obj-y += test_stub.o
+endif
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+};
+
+static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+static struct bpf_func_proto test_funcs[] = {
+ [BPF_FUNC_unspec] = {
+ .func = test_func,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ },
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+ if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ return NULL;
+ return &test_funcs[func_id];
+}
+
+static const struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+} test_ctx_access[] = {
+ [offsetof(struct bpf_context, arg1)] = {
+ FIELD_SIZEOF(struct bpf_context, arg1),
+ BPF_READ
+ },
+ [offsetof(struct bpf_context, arg2)] = {
+ FIELD_SIZEOF(struct bpf_context, arg2),
+ BPF_READ
+ },
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+ return false;
+
+ access = &test_ctx_access[off];
+ if (access->size == size && (access->type & type))
+ return true;
+
+ return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+ .get_func_proto = test_func_proto,
+ .is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+ .ops = &test_ops,
+ .type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static struct bpf_map *test_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+
+ map = kzalloc(sizeof(*map), GFP_USER);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ return map;
+}
+
new file mode 100644
index 000000000000..634391797856
--- /dev/null
+++ b/samples/bpf/Makefile
@@ -0,0 +1,12 @@
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := test_verifier
+
+test_verifier-objs := test_verifier.o libbpf.o
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS += -I$(objtree)/usr/include
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
new file mode 100644
index 000000000000..ff6504420738
--- /dev/null
+++ b/samples/bpf/libbpf.c
@@ -0,0 +1,94 @@
+/* eBPF mini library */
+#include <stdlib.h>
+#include <stdio.h>
+#include <linux/unistd.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/netlink.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include "libbpf.h"
+
+static __u64 ptr_to_u64(void *ptr)
+{
+ return (__u64) (unsigned long) ptr;
+}
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries)
+{
+ union bpf_attr attr = {
+ .map_type = map_type,
+ .key_size = key_size,
+ .value_size = value_size,
+ .max_entries = max_entries
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
+int bpf_update_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_lookup_elem(int fd, void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_delete_elem(int fd, void *key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_get_next_key(int fd, void *key, void *next_key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .next_key = ptr_to_u64(next_key),
+ };
+
+ return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+
+#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
+
+char bpf_log_buf[LOG_BUF_SIZE];
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ const struct bpf_insn *insns, int prog_len,
+ const char *license)
+{
+ union bpf_attr attr = {
+ .prog_type = prog_type,
+ .insns = ptr_to_u64((void *) insns),
+ .insn_cnt = prog_len / sizeof(struct bpf_insn),
+ .license = ptr_to_u64((void *) license),
+ .log_buf = ptr_to_u64(bpf_log_buf),
+ .log_size = LOG_BUF_SIZE,
+ .log_level = 1,
+ };
+
+ bpf_log_buf[0] = 0;
+
+ return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
new file mode 100644
index 000000000000..8a31babeca5d
--- /dev/null
+++ b/samples/bpf/libbpf.h
@@ -0,0 +1,172 @@
+/* eBPF mini library */
+#ifndef __LIBBPF_H
+#define __LIBBPF_H
+
+struct bpf_insn;
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
+ int max_entries);
+int bpf_update_elem(int fd, void *key, void *value);
+int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_delete_elem(int fd, void *key);
+int bpf_get_next_key(int fd, void *key, void *next_key);
+
+int bpf_prog_load(enum bpf_prog_type prog_type,
+ .imm = ((__u64) (IMM)) >> 32 })
+
+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
new file mode 100644
index 000000000000..d10992e2740e
--- /dev/null
+++ b/samples/bpf/test_verifier.c
@@ -0,0 +1,548 @@
+/*
+ * Testsuite for eBPF verifier
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+ int prog_fd, i;
+
+ }
+ } else {
+ if (prog_fd >= 0) {
+ printf("FAIL\nunexpected success to load\n");
+ printf("%s", bpf_log_buf);
+ goto fail;
+ }
+ if (strstr(bpf_log_buf, tests[i].errstr) == 0) {
+ printf("FAIL\nunexpected error message: %s",
+ bpf_log_buf);
+ goto fail;
+ }
+ }
+
+ printf("OK\n");
+fail:
+ if (map_fd >= 0)
+ close(map_fd);
+ close(prog_fd);
+
+ }
+
+ return 0;
+}
+
+int main(void)
+{
+ return test();
+}

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:01 PM9/16/14
to
More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/bpf.h | 47 +++
kernel/bpf/verifier.c | 1003 ++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 1049 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dfeb36f8971..3cf91754a957 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
void bpf_map_put(struct bpf_map *map);
struct bpf_map *bpf_map_get(struct fd f);

/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
* to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
* instructions after verifying
@@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f);
struct bpf_func_proto {
u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
bool gpl_only;
+ enum bpf_return_type ret_type;
+ enum bpf_arg_type arg1_type;
+ enum bpf_arg_type arg2_type;
+ enum bpf_arg_type arg3_type;
+ enum bpf_arg_type arg4_type;
+ enum bpf_arg_type arg5_type;
+};
+
+/* bpf_context is intentionally undefined structure. Pointer to bpf_context is
+ * the first argument to eBPF programs.
+ * For socket filters: 'struct bpf_context *' == 'struct sk_buff *'
+ */
+struct bpf_context;
+
+enum bpf_access_type {
+ BPF_READ = 1,
+ BPF_WRITE = 2
};

struct bpf_verifier_ops {
/* return eBPF function prototype for verification */
const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+
+ /* return true if 'size' wide access at offset 'off' within bpf_context
+ * with 'type' (read or write) is allowed
+ */
+ bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
};

struct bpf_prog_type_list {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index effab7d1c7e8..663d02c6e23c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,6 +125,72 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */

/* single container for all structs
@@ -132,6 +198,9 @@
*/
struct verifier_env {
struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
};
@@ -160,6 +229,45 @@ static void verbose(const char *fmt, ...)
va_end(args);
}

+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ int i;
+
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ struct bpf_stack_slot *slot;
+ int i;
+
+ }
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+
+ if (slot->stype == STACK_SPILL) {
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < 8; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_SPILL_PART) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] = slot->reg_st;
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ }
+ return 0;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check whether atomic_add can read the memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1));
+
+ /* check whether atomic_add can write into the same memory */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1));
+ return 0;
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ }
+
+ return 0;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
/* return the map pointer stored inside BPF_LD_IMM64 instruction */
static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
{
@@ -313,6 +1110,34 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
return (struct bpf_map *) (unsigned long) imm64;
}

+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP));
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -492,6 +1317,181 @@ free_st:
return ret;
}

+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ _(check_alu_op(regs, insn));
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+
+ _(check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK));
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ _(check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg));
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ _(check_xadd(env, insn));
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ _(check_reg_arg(regs, insn->src_reg, SRC_OP));
+ /* check src2 operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg));
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ _(check_reg_arg(regs, insn->dst_reg, SRC_OP));
+
+ /* check that memory (dst_reg + off) is writeable */
+ _(check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1));
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ _(check_call(env, insn->imm));
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ _(check_reg_arg(regs, BPF_REG_0, SRC_OP));
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ _(check_cond_jmp_op(env, insn, &insn_idx));
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ verbose("LD_ABS is not supported yet\n");
+ return -EINVAL;
+ } else if (mode == BPF_IMM) {
+ _(check_ld_imm(env, insn));
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -645,9 +1645,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

- /* ret = do_check(env); */
+ ret = do_check(env);

skip_full_check:
+ while (pop_stack(env, NULL) >= 0);

if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions
to refer to process-local map_fd. Scan the program for such instructions and
if FDs are valid, convert them to 'struct bpf_map' pointers which will be used
by verifier to check access to maps in bpf_map_lookup/update() calls.
If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping
BPF_PSEUDO_MAP_FD flag.

Note that eBPF interpreter is generic and knows nothing about pseudo insns.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/linux/filter.h | 6 ++
kernel/bpf/verifier.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 153 insertions(+)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ffc0958d85e..ca95abd2bed1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -145,6 +145,12 @@ struct bpf_prog_aux;
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })

+#define BPF_PSEUDO_MAP_FD 1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 871edc1f2e1f..7227543e474b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,10 +125,15 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
/* single container for all structs
* one verifier_env per bpf_check() call
*/
struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
};

/* verbose verifier prints what it's seeing
@@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn)
}
}

+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
@@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;

+ env->prog = prog;
+
/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);

@@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
log_level = 0;
}

+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

+skip_full_check:
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
done as separate commit to ease conflict resolution

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
this patch adds all of eBPF verfier documentation and empty bpf_check()

The end goal for the verifier is to statically check safety of the program.

Verifier will catch:
- loops
- out of range jumps
- unreachable instructions
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention

More details in Documentation/networking/filter.txt

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
Documentation/networking/filter.txt | 224 +++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 2 +
kernel/bpf/Makefile | 2 +-
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 133 +++++++++++++++++++++
5 files changed, 361 insertions(+), 2 deletions(-)
create mode 100644 kernel/bpf/verifier.c

diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt
index 1900d29518f1..f1c90967f748 100644
--- a/Documentation/networking/filter.txt
+++ b/Documentation/networking/filter.txt
@@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg.
Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads
32-bit immediate value into a register.

'maps' is a generic storage of different types for sharing data between kernel
@@ -1040,6 +1133,137 @@ The map is defined by:
. key size in bytes
. value size in bytes

+Understanding eBPF verifier messages
+------------------------------------
+
+The following are few examples of invalid eBPF programs and verifier error
+messages as seen in the log:
+
+Program with unreachable instructions:
+static struct bpf_insn prog[] = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+};
+Error:
+ unreachable insn 1
+
+Program that reads uninitialized register:
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r0 = r2
+ R2 !read_ok
+
+Program that doesn't initialize R0 before exiting:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r1
+ 1: (95) exit
+ R0 !read_ok
+
+Program that accesses stack out of bounds:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 +8) = 0
+ invalid stack off=8 size=8
+
+Program that doesn't initialize stack before passing its address into function:
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (bf) r2 = r10
+ 1: (07) r2 += -8
+ 2: (b7) r1 = 0x0
+ 3: (85) call 1
+ invalid indirect read from stack off -8+0 size 8
+
+Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ fd 0 is not pointing to valid bpf_map
+
+Program that doesn't check return value of map_lookup_elem() before accessing
+map element:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 0x0
+ 4: (85) call 1
+ 5: (7a) *(u64 *)(r0 +0) = 0
+ R0 invalid mem access 'map_value_or_null'
+
+Program that correctly checks map_lookup_elem() returned value for NULL, but
+accesses the memory with incorrect alignment:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+1
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +4) = 0
+ misaligned access off 4 size 8
+
+Program that correctly checks map_lookup_elem() returned value for NULL and
+accesses memory with correct alignment in one side of 'if' branch, but fails
+to do so in the other side of 'if' branch:
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+Error:
+ 0: (7a) *(u64 *)(r10 -8) = 0
+ 1: (bf) r2 = r10
+ 2: (07) r2 += -8
+ 3: (b7) r1 = 1
+ 4: (85) call 1
+ 5: (15) if r0 == 0x0 goto pc+2
+ R0=map_ptr R10=fp
+ 6: (7a) *(u64 *)(r0 +0) = 0
+ 7: (95) exit
+
+ from 5 to 8: R0=imm0 R10=fp
+ 8: (7a) *(u64 *)(r0 +0) = 1
+ R0 invalid mem access 'imm'
+
Testing
-------

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 92979182be81..9dfeb36f8971 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -83,5 +83,7 @@ struct bpf_prog_aux {

void bpf_prog_put(struct bpf_prog *prog);
struct bpf_prog *bpf_prog_get(u32 ufd);
+/* verify correctness of eBPF program */
+int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);

#endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e9f7334ed07a..3c726b0995b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o syscall.o
+obj-y := core.o syscall.o verifier.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5a45cdc511e1..67b5e29f183e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;

/* run eBPF verifier */
- /* err = bpf_check(prog, tb); */
+ err = bpf_check(prog, attr);

if (err < 0)
goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..d6f9c3d6b4d7
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:01 PM9/16/14
to
Hi All,

v12 -> v13:
- replaced 'foo __user *' pointers with __aligned_u64 (suggested by David)
- added __attribute__((aligned(8)) to 'union bpf_attr' to keep
constant alignment between patches
- updated manpage and syscall wrappers due to __aligned_u64
- rebased, retested on x64 with 32-bit and 64-bit userspace and on i386,
build tested on arm32,sparc64
eBPF programs are similar to kernel modules. They are loaded by the
user process and automatically unloaded when process exits. Each eBPF
program is a safe run-to-completion set of instructions. eBPF verifier
statically determines that the program terminates and is safe to
struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type;
__u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
};

struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
__u32 map_fd;
__aligned_u64 key;
union {
__aligned_u64 value;
__aligned_u64 next_key;
};
};

struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type;
__u32 insn_cnt;
__aligned_u64 insns; /* 'const struct bpf_insn *' */
__aligned_u64 license; /* 'const char *' */
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
__aligned_u64 log_buf; /* user supplied 'char *' buffer */
};
} __attribute__((aligned(8)));

eBPF maps
maps is a generic storage of different types for sharing data between
kernel and userspace.

Any map type has the following attributes:
. type
. max number of elements
. key size in bytes
. value size in bytes

The following wrapper functions demonstrate how this syscall can be
used to access the maps. The functions use the cmd argument to invoke
different operations.

BPF_MAP_CREATE
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr = {
.map_type = map_type,
.key_size = key_size,
.value_size = value_size,
.max_entries = max_entries
};

return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
int bpf_lookup_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};

return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
}
bpf() syscall looks up an element with given key in a map fd.
If element is found it returns zero and stores element's value
into value. If element is not found it returns -1 and sets
errno to ENOENT.

BPF_MAP_UPDATE_ELEM
int bpf_update_elem(int fd, void *key, void *value)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.value = ptr_to_u64(value),
};

return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}
The call creates or updates element with given key/value in a
map fd. On success it returns zero. On error, -1 is returned
and errno is set to EINVAL or EPERM or ENOMEM or E2BIG. E2BIG
indicates that number of elements in the map reached max_entries
limit specified at map creation time.

BPF_MAP_DELETE_ELEM
int bpf_delete_elem(int fd, void *key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
};

return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
}
The call deletes an element in a map fd with given key. Returns
zero on success. If element is not found it returns -1 and sets
errno to ENOENT.

BPF_MAP_GET_NEXT_KEY
int bpf_get_next_key(int fd, void *key, void *next_key)
{
union bpf_attr attr = {
.map_fd = fd,
.key = ptr_to_u64(key),
.next_key = ptr_to_u64(next_key),
};

return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
}
The call looks up an element by key in a given map fd and
returns key of the next element into next_key pointer. If key is
not found, it return zero and returns key of the first element
into next_key. If key is the last element, it returns -1 and
sets errno to ENOENT. Other possible errno values are ENOMEM,
EFAULT, EPERM, EINVAL. This method can be used to iterate over
all elements of the map.

close(map_fd)
will delete the map map_fd. Exiting process will delete all
maps automatically.

eBPF programs
BPF_PROG_LOAD
This cmd is used to load eBPF program into the kernel.

char bpf_log_buf[LOG_BUF_SIZE];

int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int insn_cnt,
const char *license)
{
union bpf_attr attr = {
.prog_type = prog_type,
.insns = ptr_to_u64(insns),
.insn_cnt = insn_cnt,
.license = ptr_to_u64(license),
.log_buf = ptr_to_u64(bpf_log_buf),
license license string, which must be GPL compatible to call
helper functions marked gpl_only
printf("failed to create map '%s'\n", strerror(errno));
Linux 2014-09-16 BPF(2)

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
struct {
...
__u32 log_level; /* verbosity level of eBPF verifier */
__u32 log_size; /* size of user buffer */
__aligned_u64 log_buf; /* user supplied 'char *buffer' */
};
};

when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.

'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:

BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
BPF_EXIT_INSN(),

will be rejected with the following multi-line message in log_buf:

0: (7a) *(u64 *)(r10 -8) = 0
1: (bf) r2 = r10
2: (07) r2 += -8
3: (b7) r1 = 0
4: (85) call 1
5: (15) if r0 == 0x0 goto pc+1
R0=map_ptr R10=fp
6: (7a) *(u64 *)(r0 +4) = 0
misaligned access off 4 size 8

The format of the output can change at any time as verifier evolves.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
include/uapi/linux/bpf.h | 3 +
kernel/bpf/syscall.c | 2 +-
kernel/bpf/verifier.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 239 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 424f442016e7..31b0ac208a52 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -138,6 +138,9 @@ union bpf_attr {
__u32 insn_cnt;
__aligned_u64 insns;
__aligned_u64 license;
+ __u32 log_level; /* verbosity level of verifier */
+ __u32 log_size; /* size of user buffer */
+ __aligned_u64 log_buf; /* user supplied buffer */
};
} __attribute__((aligned(8)));

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 67b5e29f183e..c7be7163bd11 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
}

/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD license
+#define BPF_PROG_LOAD_LAST_FIELD log_buf

static int bpf_prog_load(union bpf_attr *attr)
{
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6f9c3d6b4d7..871edc1f2e1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,9 +125,244 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/

+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
int ret = -EINVAL;

+ if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ /* ret = do_check(env); */
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
return ret;
}
--
1.7.9.5

Alexei Starovoitov

unread,
Sep 16, 2014, 8:50:02 PM9/16/14
to
check that control flow graph of eBPF program is a directed acyclic graph

check_cfg() does:
- detect loops
- detect unreachable instructions
- check that program terminates with BPF_EXIT insn
- check that all branches are within program boundary

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
kernel/bpf/verifier.c | 183 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 183 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7227543e474b..effab7d1c7e8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
+ ret = -EINVAL; \
+ goto free_st; \
+ } \
+ if (st[w] == 0) { \
+ /* tree-edge */ \
+ st[T] = DISCOVERED | E; \
+ st[w] = DISCOVERED; \
+ PUSH_INT(w); \
+ goto peek_stack; \
+ } else if ((st[w] & 0xF0) == DISCOVERED) { \
+ verbose("back-edge from insn %d to %d\n", T, w); \
+ ret = -EINVAL; \
+ goto free_st; \
+ } else if (st[w] == EXPLORED) { \
+ /* forward- or cross-edge */ \
+ st[T] = DISCOVERED | E; \
+ } else { \
+ verbose("insn state internal bug\n"); \
+ ret = -EFAULT; \
+ goto free_st; \
+ } \
+ } while (0)
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int cur_stack = 0;
+ int *stack;
+ int ret = 0;
+ int *st;
+ int i, t;
+
+ st = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ stack = kzalloc(sizeof(int) * insn_cnt, GFP_KERNEL);
+ if (!stack) {
+ kfree(st);
+ return -ENOMEM;
+ }
+
+ st[0] = DISCOVERED; /* mark 1st insn as discovered */
+ PUSH_INT(0);
+
+peek_stack:
+ while ((t = PEEK_INT()) != -1) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto free_st;
+ }
+ /* unconditional jump with single edge */
+ PUSH_INSN(t, t + insns[t].off + 1, FALLTHROUGH);
+ } else {
+ /* conditional jump with two edges */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ PUSH_INSN(t, t + insns[t].off + 1, BRANCH);
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ PUSH_INSN(t, t + 1, FALLTHROUGH);
+ }
+
+mark_explored:
+ st[t] = EXPLORED;
+ if (POP_INT() == -1) {
+ verbose("pop_int internal bug\n");
+ ret = -EFAULT;
+ goto free_st;
+ }
+ }
+
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (st[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto free_st;
+ }
+ }
+
+free_st:
+ kfree(st);
+ kfree(stack);
+ return ret;
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -462,6 +641,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;

+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
/* ret = do_check(env); */

skip_full_check:
--
1.7.9.5

Daniel Borkmann

unread,
Sep 17, 2014, 3:00:01 AM9/17/14
to
I was looking to find a use case for this item, but couldn't find anything, so
this seems to be dead code?

Was it, so that each time you extend an uapi structure like above that you would
only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not work for
old binaries using this ABI running on newer kernels where there are different
expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of compilation.
So only because of the verifier error log (which are global vars here) we
now have to hold a eBPF-related mutex lock each time when attaching a program?

Also, if you really have to do the verifier error log, can't we spare ourself
most part of the textifying parts if you would encode the verifier log into a
normal structure array with eBPF specific error codes and then do all this
pretty printing in user space? Why is that impossible? I really think it's odd.

Daniel Borkmann

unread,
Sep 17, 2014, 3:20:01 AM9/17/14
to
On 09/10/2014 08:08 PM, Alexei Starovoitov wrote:
> On Wed, Sep 10, 2014 at 4:35 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>
>> Since we already have an extensive BPF test suite, that is, lib/test_bpf.c,
>> which currently also does sanity checks for the classic BPF verifier, is
>> there a reason these verifier test cases cannot be extended/integrated there
>> as well but have to go to kernel/bpf/test_stub.c resp.
>> samples/bpf/test_verifier.c ?
>> I don't like that we put testing code into kernel/bpf/ whereas we already
>> have a BPF test infrastructure in the kernel elsewhere.
>
> yes. there is a reason. Verifier needs to be tested from user space,
> since it works on fds. Process local map_fd are part of the eBPF
> programs. Therefore one is testing things from kernel and
> another from userspace. We definitely need both.
> Currently there is no use case to call verifier from inside
> the kernel. I'm not sure there will be one. Verifier's main
> purpose is to check user supplied programs and provide
> humans an understandable error messages of what
> is 'unsafe' in particular program.
> Eventually we will integrate this verifier messages with
> program compilation. Like, the user would write a program
> in C then invoke a wrapper of compiler and verifier, which
> will point to lines in C code which are doing something
> wrong like loops or out of bounds access. Currently verifier
> complains about particular 'unsafe' instruction, but
> humans have hard time correlating asm to C.

That actually still doesn't answer my question why the test stub
cannot live in lib/test_bpf where we have our actual testing
framework for eBPF/BPF, also since you exactly only build test_stub.c
when TEST_BPF is enabled which is the Kconfig for lib/test_bpf.

Alexei Starovoitov

unread,
Sep 17, 2014, 12:10:02 PM9/17/14
to
On Tue, Sep 16, 2014 at 11:51 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>
>> /* last field in 'union bpf_attr' used by this command */
>> -#define BPF_PROG_LOAD_LAST_FIELD license
>> +#define BPF_PROG_LOAD_LAST_FIELD log_buf
>
>
> I was looking to find a use case for this item, but couldn't find anything,
> so
> this seems to be dead code?

See CHECK_ATTR() macro and comment next to it in patch #1

> Was it, so that each time you extend an uapi structure like above that you
> would
> only access the structure up to BPF_PROG_LOAD_LAST_FIELD? That might not
> work for
> old binaries using this ABI running on newer kernels where there are
> different
> expectations of what BPF_PROG_LOAD_LAST_FIELD has been at the time of
> compilation.

exactly the opposite.
CHECK_ATTR() is checking that all fields beyond last for given
command are zero, so we can extend bpf_attr with new fields
added after last.
Transition from patch 4 to patch 7 and the hunk you quoted are
demonstrating exactly that. Say, userspace was compiled
with bpf_attr as defined in patch 4 and it populated fields all the way
till 'license', and kernel is compiled with patch 7. Kernel does:
union bpf_attr attr = {};
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
copy_from_user(&attr, uattr, size)
so newer fields (all the way till log_buf) stay zero and kernel
behavior is the same as it was in patch 4.
So older user space works as-is with newer kernel.
Another example:
say, we want to add another flag to lookup() method added in patch 3,
we just add another 'flags' field after 'value' field and adjust:
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
Older user apps stay binary compatible with newer kernel.
Does this explain things?

>> +
>> + /* grab the mutex to protect few globals used by verifier */
>> + mutex_lock(&bpf_verifier_lock);
>
>
> So only because of the verifier error log (which are global vars here) we
> now have to hold a eBPF-related mutex lock each time when attaching a
> program?

correct.
it's done on purpose to simplify verifier code.
User app is blocked in bpf syscall until verifier checks the program.
Not a big deal. I don't expect a lot of concurrent program loading.
If it somehow becomes an issue, when can fix it, but for now I think
less lines of verifier code is definitely a better trade off.

> Also, if you really have to do the verifier error log, can't we spare
> ourself
> most part of the textifying parts if you would encode the verifier log into
> a
> normal structure array with eBPF specific error codes and then do all this
> pretty printing in user space? Why is that impossible? I really think it's
> odd.

I thought I explained this already...
verifier log is not at all "an array of specific error codes".
verifier is printing the trace and state of what it's seeing
while analyzing the program. Very branchy program may
generate a trace log several times larger than program size.
pretty text is the most convenient way to pass it to user.
Theoretically we can come with some obscure log format for
this internal verifier state, but what do we get ? only additional
complexity. This obscure format will change just as text will
change, because verifier will evolve. If you're looking for a way
to fix this output into ABI, it's not possible. Verifier will
become more advanced in the future and it's trace whether
in text or in obscure encoded structs will change.
Therefore text is much simpler option.
Also consider the learning curve for somebody planning
to add new features to verifier. This trace log is a perfect way
to understand how verifier is working. Try simple program
with multiple branches and see what kind of log is dumped.

Another example:
To make verifier easier to review, in this patch set I didn't
include 'state pruning optimization' patch. That patch
will change the trace log, because it changes the way
verifier is working. If we had to introduce struct
encoding of trace log, it would need to be changed already.
So pretty text is the simplest and most convenient way.

Alexei Starovoitov

unread,
Sep 17, 2014, 12:20:04 PM9/17/14
to
On Wed, Sep 17, 2014 at 12:16 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>
>
> That actually still doesn't answer my question why the test stub
> cannot live in lib/test_bpf where we have our actual testing
> framework for eBPF/BPF, also since you exactly only build test_stub.c
> when TEST_BPF is enabled which is the Kconfig for lib/test_bpf.

multiple reasons:
1.
lib/test_bpf.c is a module, whereas test_stub.c is kernel builtin.

2.
I wasn't sure that reusing CONFIG_TEST_BPF for this
purpose was a good idea. May be it's better to introduce
CONFIG_BPF_VERIFIER_TEST_STUBS or something.

3.
kernel/bpf/test_stubs.c can be removed once real tracing
or socket use case is in.

Daniel Borkmann

unread,
Sep 17, 2014, 1:10:03 PM9/17/14
to
Ok, I see. Lets say, since the introduction of this syscall you have
added a couple of features and thus extended union bpf_attr where it
grew in size over time.

You built your shiny new binary on that uapi setting, and later on
decide to run it on an older kernel. What will happen is that in your
bpf syscall handler you will return with -EINVAL on that kernel right
away since the size of union bpf_attr is greater.

That would mean over time when new features will get added, applications
that want to make sure to run on _all_ kernels where the bpf syscall is
available have to make sure to either use the _initial_ version of
union bpf_attr in order to not get an -EINVAL, or worse they have
to probe though a syscall different versions of union bpf_attr if they
wish to make use of a particular feature until they do not get an -EINVAL
anymore.

I guess that might be problematic for an application developer that
wants to ship its application across different distributions usually
running different kernels. At least those people might then consider
holding a private copy of the _initial_ version of union bpf_attr in
their own source code, but it's not pleasant I guess.

I know you seem to have the constraint to run on NET-less systems, but
netlink could partially resolve that in the sense that it would allow
to at least load an eBPF program with initial feature set. Couldn't
there be some mechanism to make this interaction more pleasant? E.g.
in BPF extensions we can ask the kernel up to what extension it
supports and accordingly adapt to it up front. I know it's just a
/trivial/ example but have you thought about something on that kind for
the syscall?

Daniel Borkmann

unread,
Sep 17, 2014, 3:20:02 PM9/17/14
to
Hm, thinking out loudly ... perhaps this could be made a library problem.
Such that the library which wraps the syscall needs to be aware of a
marker where the initial version ends, and if the application doesn't
make use of any of the new features, it would just pass in the length up
to the marker as size attribute into the syscall. Similarly, if new
features are always added to the end of a structure and the library
truncates the overall-length after the last used member we might have
a chance to load something on older kernels, haven't tried that though.

Alexei Starovoitov

unread,
Sep 17, 2014, 3:40:04 PM9/17/14
to
On Wed, Sep 17, 2014 at 12:17 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
> On 09/17/2014 07:03 PM, Daniel Borkmann wrote:
>>
>> You built your shiny new binary on that uapi setting, and later on
>> decide to run it on an older kernel. What will happen is that in your
>> bpf syscall handler you will return with -EINVAL on that kernel right
>> away since the size of union bpf_attr is greater.

that's a correct description of the code in this set.
What I mentioned in cover letter during v7 was:
"I've decided not to bother with forward compatiblity for now.
We can address it later the way perf_event_open did."
I was trying not open this can of worms :)

>> That would mean over time when new features will get added, applications
>> that want to make sure to run on _all_ kernels where the bpf syscall is
>> available have to make sure to either use the _initial_ version of
>> union bpf_attr in order to not get an -EINVAL, or worse they have
>> to probe though a syscall different versions of union bpf_attr if they
>> wish to make use of a particular feature until they do not get an -EINVAL
>> anymore.

Correct as well.
I say that would be the least of user space concerns.
If user app is built with newer bpf_attr and relies on it for some
functionality, it would need a lot more than probing.
Depending how big the new bpf_attr feature is, the app might
not have a workaround for older kernels at all.
So seeing EINVAL on older kernel might be a preferred way.

>> in BPF extensions we can ask the kernel up to what extension it
>> supports and accordingly adapt to it up front. I know it's just a
>> /trivial/ example but have you thought about something on that kind for
>> the syscall?

A 2nd option would be to do what perf_copy_attr() does:
when newer struct is coming from user space, kernel checks
that space beyond known last field is all zeros.
We can do the same, but I'm not convinced that it will
help userspace much. Newer user space can only be
_compiled_ with newer bpf_attr. It is still cannot use any new
features and it needs to make sure that all new fields are zero.
I'm not sure how it helps.
perf_event_open() also returns size. In that case it's helpful,
since it's one structure. In ebpf case we have multiple
structures under one union, so returning size of the whole
bpf_attr doesn't help one particular command.

> Hm, thinking out loudly ... perhaps this could be made a library problem.
> Such that the library which wraps the syscall needs to be aware of a
> marker where the initial version ends, and if the application doesn't
> make use of any of the new features, it would just pass in the length up
> to the marker as size attribute into the syscall. Similarly, if new
> features are always added to the end of a structure and the library
> truncates the overall-length after the last used member we might have
> a chance to load something on older kernels, haven't tried that though.

that's a 3rd option. I think it's cleaner than 2nd, since it solves it
completely from user space.
It can even be smarter than that. If this syscall wrapper library
sees that newer features are used and it can workaround them:
it can chop size and pass older fields into the older kernel
and when it returns, do a workaround based on newer fields.

Daniel Borkmann

unread,
Sep 17, 2014, 6:10:02 PM9/17/14
to
On 09/17/2014 06:17 PM, Alexei Starovoitov wrote:
> On Wed, Sep 17, 2014 at 12:16 AM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>
>> That actually still doesn't answer my question why the test stub
>> cannot live in lib/test_bpf where we have our actual testing
>> framework for eBPF/BPF, also since you exactly only build test_stub.c
>> when TEST_BPF is enabled which is the Kconfig for lib/test_bpf.
>
> multiple reasons:
> 1.
> lib/test_bpf.c is a module, whereas test_stub.c is kernel builtin.
>
> 2.
> I wasn't sure that reusing CONFIG_TEST_BPF for this
> purpose was a good idea. May be it's better to introduce
> CONFIG_BPF_VERIFIER_TEST_STUBS or something.
>
> 3.
> kernel/bpf/test_stubs.c can be removed once real tracing
> or socket use case is in.

Yes, please, lets go for point 3 at the very least.

Alexei Starovoitov

unread,
Sep 17, 2014, 6:20:02 PM9/17/14
to
On Wed, Sep 17, 2014 at 2:59 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
>>
>> 3.
>> kernel/bpf/test_stubs.c can be removed once real tracing
>> or socket use case is in.
>
>
> Yes, please, lets go for point 3 at the very least.

agree. test_stubs is a way to have verifier testsuite
as the first ebpf user in this patch set.
There are multiple ways of connecting ebpf to tracing/sockets
and these discussions cannot happen all at once.
So test_stubs to some degree is scaffolding to bring
other pieces in place slowly with required due diligence.
I don't think there will be a need for test_stubs when
tracing+ebpf is in place. That's why I didn't introduce
special prog_type and map_type for it and instead used
'unspec' type which is invalid type and used for testing
temporarily.
verifier testsuite, of course, will stay. It will switch from
'unspec' type to real types when they're ready.

Alexei Starovoitov

unread,
Sep 17, 2014, 7:50:01 PM9/17/14
to
On Wed, Sep 17, 2014 at 12:37 PM, Alexei Starovoitov <a...@plumgrid.com> wrote:
>
>> Hm, thinking out loudly ... perhaps this could be made a library problem.
>> Such that the library which wraps the syscall needs to be aware of a
>> marker where the initial version ends, and if the application doesn't
>> make use of any of the new features, it would just pass in the length up
>> to the marker as size attribute into the syscall. Similarly, if new
>> features are always added to the end of a structure and the library
>> truncates the overall-length after the last used member we might have
>> a chance to load something on older kernels, haven't tried that though.
>
> that's a 3rd option. I think it's cleaner than 2nd, since it solves it
> completely from user space.
> It can even be smarter than that. If this syscall wrapper library
> sees that newer features are used and it can workaround them:
> it can chop size and pass older fields into the older kernel
> and when it returns, do a workaround based on newer fields.

the more I think about 'new user space + old kernel' problem,
the more certain I am that kernel should not try to help
user space, since most of the time it's not going to be enough,
but additional code in kernel would need to be maintained.

syscall commands and size of bpf_attr is the least of problems.
New map_type and prog_type will be added, new helper
functions will be available to programs.
One would think that md5 of uapi/linux/bpf.h would be
enough to say that user app is compatible... In reality,
it's not. The 'state pruning' verifier optimization I've talked
about will not change a single bit in bpf.h, but it will be
able to recognize more programs as safe.
A program developed on a new kernel with more
advanced verifier will load just fine on new kernel, but
this valid program will not load on old kernel, only because
verifier is not smart enough. Now we would need a version
of verifier exposed all the way to user space?!
imo that's too much. I think for eBPF infra kernel
should only guarantee backwards compatibility
(old user space must work with new kernel) and that's it.
That's what this patch is trying to do.
Thoughts?

Daniel Borkmann

unread,
Sep 18, 2014, 2:50:02 AM9/18/14
to
Sure, you will never get a full compatibility on that regard
while backwards compatibility needs to be guaranteed on the
other hand. I looked at perf_copy_attr() implementation and I
think that we should mimic it in a very similar way as it
exactly solves what we need.

For example, it will return with -EINVAL for (size > PAGE_SIZE)
and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
as an arbitrary hard upper limit where it is believed that it will
never grow beyond that large limit in future.

So this is a more loose constraint than what we currently do,
that is, -EINVAL on (size > sizeof(attr)) where attr is the
currently known size of a specific kernel. That would at least
be a start, you won't be able to cover everything though, but
it would allow to address the issue raised when running with
a basic feature set.

Alexei Starovoitov

unread,
Sep 18, 2014, 10:40:02 AM9/18/14
to
you missed my point. We should not 'do a start', since it
doesn't help user space in the long run and only makes
kernel more complex.

Daniel Borkmann

unread,
Sep 18, 2014, 11:00:02 AM9/18/14
to
On 09/18/2014 04:34 PM, Alexei Starovoitov wrote:
> On Wed, Sep 17, 2014 at 11:44 PM, Daniel Borkmann <dbor...@redhat.com> wrote:
...
>> Sure, you will never get a full compatibility on that regard
>> while backwards compatibility needs to be guaranteed on the
>> other hand. I looked at perf_copy_attr() implementation and I
>> think that we should mimic it in a very similar way as it
>> exactly solves what we need.
>>
>> For example, it will return with -EINVAL for (size > PAGE_SIZE)
>> and (size < PERF_ATTR_SIZE_VER0) where PAGE_SIZE has been chosen
>> as an arbitrary hard upper limit where it is believed that it will
>> never grow beyond that large limit in future.
>>
>> So this is a more loose constraint than what we currently do,
>> that is, -EINVAL on (size > sizeof(attr)) where attr is the
>> currently known size of a specific kernel. That would at least
>> be a start, you won't be able to cover everything though, but
>> it would allow to address the issue raised when running with
>> a basic feature set.
>
> you missed my point. We should not 'do a start', since it
> doesn't help user space in the long run and only makes
> kernel more complex.

Sorry, I don't think I missed your point. But if you see things
differently, fair enough, it was just a suggestion.

Alexei Starovoitov

unread,
Sep 18, 2014, 11:30:04 AM9/18/14
to
now you probably think I'm shutting you up. Sorry. Was not
my intention. Let me rephrase what I meant:
I think we should decide right now whether
'new user + old kernel' is really a problem we're going to
solve or not. If we decide to solve it, we need to have
a plan to solve it all the way. Partial fix for size of bpf_attr
is not a plan. It's something that is not addressing the problem
completely. Little bit of help is not useful for userspace. It
would need to deal with new types, verifier differences and
other things that I mentioned earlier. So we either decide
that we're going to spend time to solve all of them (not
necessarily today, but over long haul) or we're not doing
any of it.

Daniel Borkmann

unread,
Sep 18, 2014, 2:20:02 PM9/18/14
to
On 09/18/2014 05:24 PM, Alexei Starovoitov wrote:
...
> solve or not. If we decide to solve it, we need to have
> a plan to solve it all the way. Partial fix for size of bpf_attr
> is not a plan. It's something that is not addressing the problem
> completely. Little bit of help is not useful for userspace. It
> would need to deal with new types, verifier differences and
> other things that I mentioned earlier.

Hm, I don't think it would be a strict requirement to solve it
all the way, and I think that perf_event_open() with perf_copy_attr()
is not trying to do so either. It, however, is trying on a ``best
effort basis'' to still load something if new features are unused
by the binary (I guess you saw the comment in perf_copy_attr()).

Iff, e.g. due to new types we fail at the verifier stage, sure,
that's life since we only have backwards-compatible guarantee,
but in case we tried to use features we support, we're still able
to load the eBPF program while right now, we're rejecting it right
up-front. That's just my $0.02 ...
0 new messages