
[PATCH v5 6/8] x86: modify ftrace function using the new int3-based framework


Petr Mladek
Dec 3, 2013, 8:30:02 AM

The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine").

It would be great to use the new generic implementation and clean up the x86
ftrace code a bit.

Let's start with ftrace_modify_code. It does basically the same thing as
text_poke_bp. In addition, it checks that the code being replaced is the
expected one, to avoid problems with modules. The checking code can be split
out of ftrace_modify_code_direct and shared.

Note that we no longer need to set modifying_ftrace_code in
ftrace_update_ftrace_func. The int3 interrupt will be handled by
poke_int3_handler.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/kernel/ftrace.c | 120 +++++++++++++++++++----------------------------
1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d4bdd253fea7..5ade40e4a175 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -52,6 +52,33 @@ union ftrace_code_union {
} __attribute__((packed));
};

+/*
+ * Note: Due to modules and __init, code can
+ * disappear and change, we need to protect against faulting
+ * as well as code changing. We do this by using the
+ * probe_kernel_* functions.
+ */
+static int
+ftrace_check_code(unsigned long ip, unsigned const char *expected)
+{
+ unsigned char actual[MCOUNT_INSN_SIZE];
+
+ if (probe_kernel_read(actual, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ if (memcmp(actual, expected, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ONCE(1,
+ "ftrace check failed: %x %x%x%x%x vs. %x %x%x%x%x\n",
+ actual[0],
+ actual[1], actual[2], actual[3], actual[4],
+ expected[0],
+ expected[1], expected[2], expected[3], expected[4]);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ftrace_calc_offset(long ip, long addr)
{
return (int)(addr - ip);
@@ -103,28 +130,12 @@ static int
ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- /*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
- */
+ int ret;

- /* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
+ ret = ftrace_check_code(ip, old_code);

/* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
+ if (!ret && do_ftrace_mod_code(ip, new_code))
return -EPERM;

sync_core();
@@ -132,6 +143,22 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
return 0;
}

+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = ftrace_check_code(ip, old_code);
+
+ /* replace the text with the new text */
+ if (!ret)
+ ret = text_poke_bp((void *)ip, new_code, MCOUNT_INSN_SIZE,
+ (void *)ip + MCOUNT_INSN_SIZE);
+
+ return ret;
+}
+
int ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
@@ -202,10 +229,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
*/
atomic_t modifying_ftrace_code __read_mostly;

-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
@@ -230,9 +253,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);

- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ret = ftrace_modify_code(ip, old, new);

/* Also update the regs callback function */
@@ -243,20 +263,9 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ret = ftrace_modify_code(ip, old, new);
}

- atomic_dec(&modifying_ftrace_code);
-
return ret;
}

-static int is_ftrace_caller(unsigned long ip)
-{
- if (ip == (unsigned long)(&ftrace_call) ||
- ip == (unsigned long)(&ftrace_regs_call))
- return 1;
-
- return 0;
-}
-
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -266,13 +275,10 @@ static int is_ftrace_caller(unsigned long ip)
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
- unsigned long ip;
-
if (WARN_ON_ONCE(!regs))
return 0;

- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
+ if (!ftrace_location(regs->ip - 1))
return 0;

regs->ip += MCOUNT_INSN_SIZE - 1;
@@ -621,38 +627,6 @@ void ftrace_replace_code(int enable)
}
}

-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- if (ret) {
- ret = -EPERM;
- goto out;
- }
- run_sync();
- out:
- return ret;
-
- fail_update:
- probe_kernel_write((void *)ip, &old_code[0], 1);
- goto out;
-}
-
void arch_ftrace_update_code(int command)
{
/* See comment above by declaration of modifying_ftrace_code */
--
1.8.4


Petr Mladek
Dec 3, 2013, 8:30:02 AM

We would like to use text_poke_bp in ftrace. It might also be called during
boot, when there is only one CPU and we do not need to sync the others.

The check is a must-have because interrupts are still disabled at that point
during boot. The call would then cause a deadlock; see the warning in
smp_call_function_many(), kernel/smp.c:371.

The change is inspired by the code in arch/x86/kernel/ftrace.c.

Signed-off-by: Petr Mladek <pml...@suse.cz>
Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
---
arch/x86/kernel/alternative.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 10a7ec0c66f9..2e47107b9055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -602,6 +602,17 @@ static void do_sync_core(void *info)
sync_core();
}

+static void run_sync(void)
+{
+ /*
+ * We do not need to sync other cores during boot when there is only one
+ * CPU enabled. In fact, we must not because there are also disabled
+ * interrupts. The call would fail because of a potential deadlock.
+ */
+ if (num_online_cpus() != 1)
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

@@ -664,7 +675,7 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
if (unlikely(ret))
goto fail;

- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();

if (len - sizeof(int3) > 0) {
/* patch all but the first byte */
@@ -677,14 +688,14 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();
}

/* patch the first byte */
ret = text_poke(addr, opcode, sizeof(int3));
BUG_ON(ret);

- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();

fail:
bp_patching_in_progress = false;

Petr Mladek
Dec 3, 2013, 8:30:02 AM

The text_poke functions called BUG() in case of error. This was too strict.
There are situations when the system is still usable even when the patching
has failed, for example when enabling dynamic ftrace.

This commit modifies the text_poke, text_poke_early, and text_poke_bp functions
to return an error code instead of calling BUG(). The error code is returned
instead of the patched address. The address was just copied from the first
parameter, so it carried no extra information, and it has not been used
anywhere yet.

The commit also modifies the few locations where the text_poke functions were
used and the error code now has to be handled. Where error handling already
exists, for example in kgdb_arch_set_breakpoint, the error code is simply
passed on. BUG() is called in the other locations.

Note that BUG() still needs to be called in text_poke_bp when the code is
already partially modified but the operation cannot be finished.
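
Just to illustrate the new calling convention (a sketch, not part of the
patch), callers switch from using the returned address to checking an error
code:

	err = text_poke(addr, opcode, len);
	if (err)
		return err;	/* or BUG_ON(err) where a failure is fatal */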

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/include/asm/alternative.h | 7 +++--
arch/x86/kernel/alternative.c | 59 ++++++++++++++++++++++++--------------
arch/x86/kernel/jump_label.c | 11 ++++---
arch/x86/kernel/kgdb.c | 11 +++++--
arch/x86/kernel/kprobes/core.c | 10 +++++--
arch/x86/kernel/kprobes/opt.c | 8 ++++--
6 files changed, 71 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d5..a23a860a208d 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -208,7 +208,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif

-extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+extern int text_poke_early(void *addr, const void *opcode, size_t len);

/*
* Clear and restore the kernel write-protection flag on the local CPU.
@@ -224,8 +224,9 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
-extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern int text_poke(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
-extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern int text_poke_bp(void *addr, const void *opcode, size_t len,
+ void *handler);

#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df94598ad05a..10a7ec0c66f9 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -242,7 +242,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+int text_poke_early(void *addr, const void *opcode, size_t len);

/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -256,6 +256,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *a;
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
+ int err;

DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
/*
@@ -285,7 +286,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);

- text_poke_early(instr, insnbuf, a->instrlen);
+ err = text_poke_early(instr, insnbuf, a->instrlen);
+ BUG_ON(err);
}
}

@@ -295,6 +297,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
const s32 *poff;
+ int err;

mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
@@ -303,8 +306,10 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn DS segment override prefix into lock prefix */
- if (*ptr == 0x3e)
- text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ if (*ptr == 0x3e) {
+ err = text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ BUG_ON(err);
+ }
}
mutex_unlock(&text_mutex);
}
@@ -313,6 +318,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
const s32 *poff;
+ int err;

mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
@@ -321,8 +327,10 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- if (*ptr == 0xf0)
- text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ if (*ptr == 0xf0) {
+ err = text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ BUG_ON(err);
+ }
}
mutex_unlock(&text_mutex);
}
@@ -449,6 +457,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
{
struct paravirt_patch_site *p;
char insnbuf[MAX_PATCH_LEN];
+ int err;

if (noreplace_paravirt)
return;
@@ -466,7 +475,8 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,

/* Pad the rest with nops */
add_nops(insnbuf + used, p->len - used);
- text_poke_early(p->instr, insnbuf, p->len);
+ err = text_poke_early(p->instr, insnbuf, p->len);
+ BUG_ON(err);
}
}
extern struct paravirt_patch_site __start_parainstructions[],
@@ -522,10 +532,10 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to protect NMI and MCE
+ * handlers against seeing an inconsistent instruction while you patch.
*/
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
+int __init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
@@ -535,7 +545,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
- return addr;
+ return 0;
}

/**
@@ -551,7 +561,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+int __kprobes text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
@@ -566,7 +576,8 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
WARN_ON(!PageReserved(pages[0]));
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
- BUG_ON(!pages[0]);
+ if (unlikely(!pages[0]))
+ return -EFAULT;
local_irq_save(flags);
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
if (pages[1])
@@ -583,7 +594,7 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
for (i = 0; i < len; i++)
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
local_irq_restore(flags);
- return addr;
+ return 0;
}

static void do_sync_core(void *info)
@@ -634,9 +645,10 @@ int poke_int3_handler(struct pt_regs *regs)
*
* Note: must be called under text_mutex.
*/
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
unsigned char int3 = 0xcc;
+ int ret = 0;

bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
@@ -648,15 +660,18 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
*/
smp_wmb();

- text_poke(addr, &int3, sizeof(int3));
+ ret = text_poke(addr, &int3, sizeof(int3));
+ if (unlikely(ret))
+ goto fail;

on_each_cpu(do_sync_core, NULL, 1);

if (len - sizeof(int3) > 0) {
/* patch all but the first byte */
- text_poke((char *)addr + sizeof(int3),
- (const char *) opcode + sizeof(int3),
- len - sizeof(int3));
+ ret = text_poke((char *)addr + sizeof(int3),
+ (const char *) opcode + sizeof(int3),
+ len - sizeof(int3));
+ BUG_ON(ret);
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -666,13 +681,15 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
}

/* patch the first byte */
- text_poke(addr, opcode, sizeof(int3));
+ ret = text_poke(addr, opcode, sizeof(int3));
+ BUG_ON(ret);

on_each_cpu(do_sync_core, NULL, 1);

+fail:
bp_patching_in_progress = false;
smp_wmb();

- return addr;
+ return ret;
}

diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..802cda1574a4 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -38,12 +38,13 @@ static void bug_at(unsigned char *ip, int line)

static void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t),
+ int (*poker)(void *, const void *, size_t),
int init)
{
union jump_code_union code;
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+ int err;

if (type == JUMP_LABEL_ENABLE) {
if (init) {
@@ -96,10 +97,12 @@ static void __jump_label_transform(struct jump_entry *entry,
*
*/
if (poker)
- (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ err = (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
else
- text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
- (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+ err = text_poke_bp((void *)entry->code, &code,
+ JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+ BUG_ON(err);
}

void arch_jump_label_transform(struct jump_entry *entry,
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960e..ce542b5fa55a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -766,8 +766,10 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
*/
if (mutex_is_locked(&text_mutex))
return -EBUSY;
- text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
- BREAK_INSTR_SIZE);
+ err = text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
if (err)
return err;
@@ -792,7 +794,10 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
*/
if (mutex_is_locked(&text_mutex))
goto knl_write;
- text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+ err = text_poke((void *)bpt->bpt_addr, bpt->saved_instr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ goto knl_write;
err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
goto knl_write;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 79a3f9682871..f4a4d15fa129 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -409,12 +409,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)

void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ int ret;
+
+ ret = text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ BUG_ON(ret);
}

void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, &p->opcode, 1);
+ int ret;
+
+ ret = text_poke(p->addr, &p->opcode, 1);
+ BUG_ON(ret);
}

void __kprobes arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 898160b42e43..1e4c062e9ea8 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -376,6 +376,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buf[RELATIVEJUMP_SIZE];
+ int err;

list_for_each_entry_safe(op, tmp, oplist, list) {
s32 rel = (s32)((long)op->optinsn.insn -
@@ -390,8 +391,9 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
insn_buf[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buf[1]) = rel;

- text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ err = text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
op->optinsn.insn);
+ BUG_ON(err);

list_del_init(&op->list);
}
@@ -401,12 +403,14 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buf[RELATIVEJUMP_SIZE];
+ int err;

/* Set int3 to first byte for kprobes */
insn_buf[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ err = text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
op->optinsn.insn);
+ BUG_ON(err);
}

/*

Petr Mladek
Dec 3, 2013, 8:30:03 AM

Let's continue with replacing the existing int3-based framework in ftrace with
the new generic function introduced by the commit fd4363fff3d9 (x86: Introduce
int3 (breakpoint)-based instruction patching).

This time, use it in ftrace_replace_code, which modifies all the watched
function calls.

If we used text_poke_bp in ftrace_make_nop, ftrace_make_call, and
ftrace_modify_call, it would be possible to get rid of the x86-specific
ftrace_replace_code implementation and use the generic code.

This would be a really lovely change. Unfortunately, the code would be slow
because syncing all CPUs is a relatively expensive operation. For example,
I tried switching between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer was also enabled and disabled.
With 500 cycles, I got these times with the original code:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

The process slowed down when text_poke_bp was used for each record separately:

real 29m45.785s 29m47.504s 29m44.016s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m15.776s 0m16.088s 0m16.192s

Hence, we implemented the more complex text_poke_bp_iter. When we use it,
we get these times:

real 17m19.055s 17m29.200s 17m27.456s
user 0m0.052s 0m0.056s 0m0.052s
sys 0m20.236s 0m20.068s 0m20.264s

It is slightly slower than the ftrace-specific code. Well, this is the cost
of using a generic API that can also be used in other locations.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/kernel/alternative.c | 14 +-
arch/x86/kernel/ftrace.c | 428 +++++++-----------------------------------
arch/x86/kernel/traps.c | 10 -
include/linux/ftrace.h | 6 -
4 files changed, 80 insertions(+), 378 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9f8dd4f9e852..488f06d78cf1 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -616,7 +616,7 @@ int __kprobes text_poke(void *addr, const void *opcode, size_t len)
* responsible for setting the patched code read-write. This is more effective
* only when you patch many addresses at the same time.
*/
-static int text_poke_part(void *addr, const void *opcode, size_t len)
+static int notrace text_poke_part(void *addr, const void *opcode, size_t len)
{
int ret;

@@ -665,7 +665,7 @@ static void *bp_int3_handler, *bp_int3_addr;
static size_t bp_int3_len;
static void *(*bp_int3_is_handled)(const unsigned long ip);

-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
/* bp_patching_in_progress */
smp_rmb();
@@ -764,7 +764,7 @@ fail:
return ret;
}

-static int text_poke_check(void *addr, const void *opcode, size_t len)
+static int notrace text_poke_check(void *addr, const void *opcode, size_t len)
{
unsigned char actual[MAX_PATCH_LEN];
int ret;
@@ -780,7 +780,7 @@ static int text_poke_check(void *addr, const void *opcode, size_t len)
return 0;
}

-static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
+static int notrace add_iter_breakpoint(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -802,7 +802,7 @@ static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
return ret;
}

-static int update_iter_code(struct text_poke_bp_iter *iterator,
+static int notrace update_iter_code(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -820,7 +820,7 @@ static int update_iter_code(struct text_poke_bp_iter *iterator,
bp_int3_len - sizeof(bp_int3));
}

-static int finish_iter_update(struct text_poke_bp_iter *iterator,
+static int notrace finish_iter_update(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -841,7 +841,7 @@ static int finish_iter_update(struct text_poke_bp_iter *iterator,
return ret;
}

-static void recover_iter(struct text_poke_bp_iter *iterator,
+static void notrace recover_iter(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5ade40e4a175..92fe8cac0802 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -127,23 +127,6 @@ static const unsigned char *ftrace_nop_replace(void)
}

static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = ftrace_check_code(ip, old_code);
-
- /* replace the text with the new text */
- if (!ret && do_ftrace_mod_code(ip, new_code))
- return -EPERM;
-
- sync_core();
-
- return 0;
-}
-
-static int
ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
@@ -168,20 +151,7 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();

- /*
- * On boot up, and when modules are loaded, the MCOUNT_ADDR
- * is converted to a nop, and will never become MCOUNT_ADDR
- * again. This code is either running before SMP (on boot up)
- * or before the code will ever be executed (module load).
- * We do not want to use the breakpoint version in this case,
- * just modify the code directly.
- */
- if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
-
- /* Normal cases use add_brk_on_nop */
- WARN_ONCE(1, "invalid use of ftrace_make_nop");
- return -EINVAL;
+ return ftrace_modify_code(ip, old, new);
}

int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -192,44 +162,10 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);

- /* Should only be called when module is loaded */
- return ftrace_modify_code_direct(rec->ip, old, new);
+ return ftrace_modify_code(ip, old, new);
}

/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -267,80 +203,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
}

/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
- if (WARN_ON_ONCE(!regs))
- return 0;
-
- if (!ftrace_location(regs->ip - 1))
- return 0;
-
- regs->ip += MCOUNT_INSN_SIZE - 1;
-
- return 1;
-}
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return probe_kernel_write((void *)ip, val, size);
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- if (ftrace_write(ip, &brk, 1))
- return -EPERM;
-
- return 0;
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-/*
* If the record has the FTRACE_FL_REGS set, that means that it
* wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
* is not not set, then it wants to convert to the normal callback.
@@ -366,275 +228,131 @@ static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
return (unsigned long)FTRACE_ADDR;
}

-static int add_breakpoints(struct dyn_ftrace *rec, int enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = get_ftrace_addr(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
- case FTRACE_UPDATE_MODIFY_CALL:
- ftrace_addr = get_ftrace_old_addr(rec);
- /* fall through */
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
+struct ftrace_tp_iter {
+ struct ftrace_rec_iter *rec_iter;
+ struct dyn_ftrace *rec;
+ int enable;
+};

-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
+static struct ftrace_tp_iter *ftrace_tp_set_record(struct ftrace_tp_iter *tp_iter)
{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ if (!tp_iter->rec_iter)
+ return NULL;

- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return -1;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = get_ftrace_addr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = get_ftrace_old_addr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return probe_kernel_write((void *)ip, &nop[0], 1);
+ tp_iter->rec = ftrace_rec_iter_record(tp_iter->rec_iter);
+ return tp_iter;
}

-static int add_update_code(unsigned long ip, unsigned const char *new)
+void *ftrace_tp_iter_start(void *init)
{
- /* skip breakpoint */
- ip++;
- new++;
- if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
- return -EPERM;
- return 0;
+ struct ftrace_tp_iter *tp_iter = init;
+
+ tp_iter->rec_iter = ftrace_rec_iter_start();
+ return ftrace_tp_set_record(tp_iter);
}

-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+void *ftrace_tp_iter_next(void *cur)
{
- unsigned long ip = rec->ip;
- unsigned const char *new;
+ struct ftrace_tp_iter *tp_iter = cur;

- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
+ tp_iter->rec_iter = ftrace_rec_iter_next(tp_iter->rec_iter);
+ return ftrace_tp_set_record(tp_iter);
}

-static int add_update_nop(struct dyn_ftrace *rec)
+void *ftrace_tp_iter_get_addr(void *cur)
{
- unsigned long ip = rec->ip;
- unsigned const char *new;
+ struct ftrace_tp_iter *tp_iter = cur;

- new = ftrace_nop_replace();
- return add_update_code(ip, new);
+ return (void *)(tp_iter->rec->ip);
}

-static int add_update(struct dyn_ftrace *rec, int enable)
+const void *ftrace_tp_iter_get_opcode(void *cur)
{
- unsigned long ftrace_addr;
+ struct ftrace_tp_iter *tp_iter = cur;
+ unsigned long addr;
int ret;

- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(tp_iter->rec, tp_iter->enable);

switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
+ case FTRACE_UPDATE_MAKE_NOP:
+ return ftrace_nop_replace();

- case FTRACE_UPDATE_MODIFY_CALL_REGS:
- case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ addr = get_ftrace_addr(tp_iter->rec);
+ return ftrace_call_replace(tp_iter->rec->ip, addr);

- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
+ case FTRACE_UPDATE_IGNORE:
+ default: /* unknown ftrace bug */
+ return NULL;
}
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- if (ftrace_write(ip, new, 1))
- return -EPERM;
-
- return 0;
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- if (ftrace_write(ip, new, 1))
- return -EPERM;
- return 0;
}

-static int finish_update(struct dyn_ftrace *rec, int enable)
+const void *ftrace_tp_iter_get_old_opcode(void *cur)
{
- unsigned long ftrace_addr;
+ struct ftrace_tp_iter *tp_iter = cur;
+ unsigned long old_addr;
int ret;

- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(tp_iter->rec, tp_iter->enable);

switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
- case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
+ return ftrace_nop_replace();

case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ old_addr = get_ftrace_old_addr(tp_iter->rec);
+ return ftrace_call_replace(tp_iter->rec->ip, old_addr);

- return 0;
+ case FTRACE_UPDATE_IGNORE:
+ default: /* unknown ftrace bug */
+ return NULL;
+ }
}

-static void do_sync_core(void *data)
+int ftrace_tp_iter_finish(void *cur)
{
- sync_core();
-}
+ struct ftrace_tp_iter *tp_iter = cur;

-static void run_sync(void)
-{
- int enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disbled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
+ ftrace_update_record(tp_iter->rec, tp_iter->enable);
+ return 0;
}

void ftrace_replace_code(int enable)
{
- struct ftrace_rec_iter *iter;
- struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ struct text_poke_bp_iter tp_bp_iter;
+ struct ftrace_tp_iter tp_iter;
int ret;

- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
-
- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
-
- report = "updating code";
-
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
-
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- }
+ tp_iter.enable = enable;

- run_sync();
+ tp_bp_iter.init = (void *)&tp_iter;
+ tp_bp_iter.len = MCOUNT_INSN_SIZE;
+ tp_bp_iter.start = ftrace_tp_iter_start;
+ tp_bp_iter.next = ftrace_tp_iter_next;
+ tp_bp_iter.get_addr = ftrace_tp_iter_get_addr;
+ tp_bp_iter.get_opcode = ftrace_tp_iter_get_opcode;
+ tp_bp_iter.get_old_opcode = ftrace_tp_iter_get_old_opcode;
+ tp_bp_iter.finish = ftrace_tp_iter_finish;
+ tp_bp_iter.is_handled = (void *(*)(const unsigned long))ftrace_location;

- report = "removing breakpoints";
+ ret = text_poke_bp_list(&tp_bp_iter);

- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
-
- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- }
-
- run_sync();
-
- return;
-
- remove_breakpoints:
- ftrace_bug(ret, rec ? rec->ip : 0);
- printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- remove_breakpoint(rec);
- }
+ if (ret)
+ ftrace_bug(ret, (unsigned long)(tp_bp_iter.fail_addr));
}

+/*
+ * The code is modified using Int3 guard,
+ * so we do not need to call the stop machine
+ */
void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}

int __init ftrace_dyn_arch_init(void *data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed890b4c..69ad8cf8b7f0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -50,7 +50,6 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
-#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -319,15 +318,6 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
{
enum ctx_state prev_state;

-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 31ea4b428360..fe50347cf18e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -383,12 +383,6 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void);
struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter);
struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter);

-#define for_ftrace_rec_iter(iter) \
- for (iter = ftrace_rec_iter_start(); \
- iter; \
- iter = ftrace_rec_iter_next(iter))
-
-
int ftrace_update_record(struct dyn_ftrace *rec, int enable);
int ftrace_test_record(struct dyn_ftrace *rec, int enable);
void ftrace_run_stop_machine(int command);

Petr Mladek
Dec 3, 2013, 8:30:03 AM

One more change related to replacing the existing int3-based framework with
the new generic function introduced by the commit fd4363fff3d9
(x86: Introduce int3 (breakpoint)-based instruction patching).

ftrace_enable_ftrace_graph_caller and ftrace_disable_ftrace_graph_caller
modified the jump target directly without using the int3 guard. It worked
because writing the address was an atomic operation.

We do not really need to use the safe ftrace_modify_code here, but it helps
to remove more arch-specific code. The result is more consistent and more
readable code, which might be worth the change.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/kernel/ftrace.c | 63 +++++++++++-------------------------------------
1 file changed, 14 insertions(+), 49 deletions(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 92fe8cac0802..20d20289f402 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -47,7 +47,7 @@ int ftrace_arch_code_modify_post_process(void)
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- char e8;
+ char inst;
int offset;
} __attribute__((packed));
};
@@ -88,7 +88,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;

- calc.e8 = 0xe8;
+ calc.inst = 0xe8;
calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);

/*
@@ -98,27 +98,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}

-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
+static void ftrace_jump_replace(union ftrace_code_union *calc,
+ unsigned long ip, unsigned long addr)
{
- return addr >= start && addr < end;
-}
-
-static int
-do_ftrace_mod_code(unsigned long ip, const void *new_code)
-{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ calc->inst = 0xe9;
+ calc->offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
}

static const unsigned char *ftrace_nop_replace(void)
@@ -369,45 +353,26 @@ int __init ftrace_dyn_arch_init(void *data)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);

-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
-{
- unsigned char code[MCOUNT_INSN_SIZE];
-
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
-
- *(int *)(&code[1]) = new_offset;
-
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
-
- return 0;
-}
-
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
+ union ftrace_code_union old, new;

- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+ ftrace_jump_replace(&old, ip, (unsigned long)(&ftrace_stub));
+ ftrace_jump_replace(&new, ip, (unsigned long)(&ftrace_graph_caller));

- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_modify_code(ip, old.code, new.code);
}

int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
+ union ftrace_code_union old, new;

- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+ ftrace_jump_replace(&old, ip, (unsigned long)(&ftrace_graph_caller));
+ ftrace_jump_replace(&new, ip, (unsigned long)(&ftrace_stub));

- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_modify_code(ip, old.code, new.code);
}

#endif /* !CONFIG_DYNAMIC_FTRACE */

Petr Mladek
Dec 3, 2013, 8:30:02 AM

probe_kernel_read is used when modifying function calls in ftrace_replace_code,
see arch/x86/kernel/ftrace.c. On x86, the code is replaced using the int3 guard.
All functions are patched in parallel to reduce the expensive synchronization
of all CPUs. The result is that all affected functions are handled via
ftrace_int3_handler during the process.

ftrace_int3_handler is relatively slow because it has to check whether it is
responsible for the handled IP. Therefore we should not modify functions that
are used during patching; it would slow down the patching.

I tried switching between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer was also enabled and disabled.
With 500 cycles, I got these times before this commit:

real 18m2.477s 18m8.654s 18m9.196s
user 0m0.008s 0m0.008s 0m0.012s
sys 0m17.316s 0m17.104s 0m17.300s

and after this commit:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
mm/maccess.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/maccess.c b/mm/maccess.c
index d53adf9ba84b..bed9ee854ea0 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,7 +18,7 @@
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));

-long __probe_kernel_read(void *dst, const void *src, size_t size)
+long notrace __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();

Petr Mladek
Dec 3, 2013, 8:30:02 AM

The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine").

When trying to use text_poke_bp in dynamic ftrace, the process was significantly
slower than with the original code. For example, I tried switching between
7 tracers: blk, branch, function_graph, wakeup_rt, irqsoff, function, and nop.
Every tracer was also enabled and disabled. With 500 cycles, I got these times
with the original code:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

and with text_poke_bp:

real 29m45.785s 29m47.504s 29m44.016s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m15.776s 0m16.088s 0m16.192s

It turned out that most of the extra time was spent in the call:

on_each_cpu(do_sync_core, NULL, 1)

It is needed to reset the speculative queue of decoded instructions on each
CPU and is a relatively expensive operation. The original ftrace code reduced
its usage by patching all instructions in parallel.

This commit adds text_poke_bp_iter, which provides a generic interface for
patching many instructions in parallel. It is basically a mix of text_poke_bp
and ftrace_replace_code. The list of addresses and opcodes is passed via
an iterator and callbacks.

We need to iterate several times, but we do not need both the opcodes and the
address in all stages. They are passed via callbacks to avoid unnecessary
computation.

The function checks the current code before patching. It might be possible to
leave this check in the ftrace code, but such paranoia is useful in general;
it helps to keep the system consistent. And the old opcode is needed anyway:
when something goes wrong, it is used to get back to a valid state. Note that
an error might happen even when adding the breakpoint to a particular address.

The optional "finish" callback can be used to update the state of the patched
entry. For example, it is needed to update the related ftrace record.

Some opcodes might already be in the final state and do not need patching.
It would be possible to detect this by comparing the old and new opcode, but
there might be a more effective way. For example, ftrace can tell this via
ftrace_test_record. We pass this information via the get_opcode callbacks;
they return a valid opcode when a change is needed and NULL otherwise. Note
that we should call the "finish" callback even when the code is not really
modified.

Finally, ftrace wants to know more about a potential failure. The error code
is passed via the return value. The failing address and the number of the
failed record are passed via the iterator structure.
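
For illustration only, a minimal caller of the new interface could look
roughly like the sketch below. The my_* names are made up for this example
and are not part of the patch; error handling is reduced to a warning.

#define NR_MY_SITES 16

/* hypothetical example: patch a fixed array of 5-byte call sites */
struct my_site {
	void		*addr;
	unsigned char	old[5];		/* code expected to be there now */
	unsigned char	new[5];		/* code that should be installed */
};

static struct my_site my_sites[NR_MY_SITES];
static int my_idx;

static void *my_start(void *init)
{
	my_idx = 0;
	return &my_sites[0];
}

static void *my_next(void *prev)
{
	return (++my_idx < NR_MY_SITES) ? &my_sites[my_idx] : NULL;
}

static void *my_get_addr(void *cur)
{
	return ((struct my_site *)cur)->addr;
}

static const void *my_get_opcode(void *cur)
{
	return ((struct my_site *)cur)->new;
}

static const void *my_get_old_opcode(void *cur)
{
	return ((struct my_site *)cur)->old;
}

/* the int3 trap leaves regs->ip one byte past the patched address */
static void *my_is_handled(const unsigned long ip)
{
	int i;

	for (i = 0; i < NR_MY_SITES; i++)
		if (ip == (unsigned long)my_sites[i].addr + 1)
			return &my_sites[i];
	return NULL;
}

static struct text_poke_bp_iter my_iter = {
	.len		= 5,
	.start		= my_start,
	.next		= my_next,
	.get_addr	= my_get_addr,
	.get_opcode	= my_get_opcode,
	.get_old_opcode	= my_get_old_opcode,
	.is_handled	= my_is_handled,
	/* .init and .finish are optional and left unset here */
};

/* must be called under text_mutex */
static int my_patch_all(void)
{
	int err;

	err = text_poke_bp_list(&my_iter);
	if (err)
		pr_warn("patching failed at entry %d (%p)\n",
			my_iter.fail_count, my_iter.fail_addr);
	return err;
}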

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/include/asm/alternative.h | 36 ++++++
arch/x86/kernel/alternative.c | 256 +++++++++++++++++++++++++++++++++++--
2 files changed, 279 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index a23a860a208d..400b0059570a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -229,4 +229,40 @@ extern int poke_int3_handler(struct pt_regs *regs);
extern int text_poke_bp(void *addr, const void *opcode, size_t len,
void *handler);

+struct text_poke_bp_iter {
+ /* information used to start iteration from the beginning */
+ void *init;
+ /* length of the patched instruction */
+ size_t len;
+ /* details about failure if any */
+ int fail_count;
+ void *fail_addr;
+ /* iteration over entries */
+ void *(*start)(void *init);
+ void *(*next)(void *prev);
+ /* callback to get patched address */
+ void *(*get_addr)(void *cur);
+ /*
+ * Callbacks to get the patched code. They could return NULL if no
+ * patching is needed; This is useful for example in ftrace.
+ */
+ const void *(*get_opcode)(void *cur);
+ const void *(*get_old_opcode)(void *cur);
+ /*
+ * Optional function that is called when the patching of the given
+ * has finished. It might be NULL if no postprocess is needed.
+ */
+ int (*finish)(void *cur);
+ /*
+ * Helper function for int3 handler. It decides whether the given IP
+ * is being patched or not.
+ *
+ * Try to implement it as fast as possible. It affects performance
+ * of the system when the patching is in progress.
+ */
+ void *(*is_handled)(const unsigned long ip);
+};
+
+extern int text_poke_bp_list(struct text_poke_bp_iter *iter);
+
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2e47107b9055..2f6088282724 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/stringify.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
@@ -613,8 +614,11 @@ static void run_sync(void)
on_each_cpu(do_sync_core, NULL, 1);
}

+static char bp_int3;
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
+static size_t bp_int3_len;
+static void *(*bp_int3_is_handled)(const unsigned long ip);

int poke_int3_handler(struct pt_regs *regs)
{
@@ -624,14 +628,23 @@ int poke_int3_handler(struct pt_regs *regs)
if (likely(!bp_patching_in_progress))
return 0;

- if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ if (user_mode_vm(regs))
return 0;

- /* set up the specified breakpoint handler */
- regs->ip = (unsigned long) bp_int3_handler;
+ /* Check if address is handled by text_poke_bp */
+ if (bp_int3_handler && regs->ip == (unsigned long)bp_int3_addr) {
+ regs->ip = (unsigned long)bp_int3_handler;
+ return 1;
+ }

- return 1;
+ /* Check if address is handled by text_poke_bp_list */
+ if (bp_int3_is_handled && bp_int3_is_handled(regs->ip)) {
+ /* just skip the instruction */
+ regs->ip += bp_int3_len - 1;
+ return 1;
+ }

+ return 0;
}

/**
@@ -658,11 +671,13 @@ int poke_int3_handler(struct pt_regs *regs)
*/
int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
- unsigned char int3 = 0xcc;
- int ret = 0;
+ int ret;

+ bp_int3 = 0xcc;
bp_int3_handler = handler;
- bp_int3_addr = (u8 *)addr + sizeof(int3);
+ bp_int3_addr = (u8 *)addr + sizeof(bp_int3);
+ bp_int3_len = len;
+ bp_int3_is_handled = NULL;
bp_patching_in_progress = true;
/*
* Corresponding read barrier in int3 notifier for
@@ -671,17 +686,17 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
*/
smp_wmb();

- ret = text_poke(addr, &int3, sizeof(int3));
+ ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
if (unlikely(ret))
goto fail;

run_sync();

- if (len - sizeof(int3) > 0) {
+ if (len - sizeof(bp_int3) > 0) {
/* patch all but the first byte */
- ret = text_poke((char *)addr + sizeof(int3),
- (const char *) opcode + sizeof(int3),
- len - sizeof(int3));
+ ret = text_poke((char *)addr + sizeof(bp_int3),
+ (const char *) opcode + sizeof(bp_int3),
+ len - sizeof(bp_int3));
BUG_ON(ret);
/*
* According to Intel, this core syncing is very likely
@@ -692,7 +707,7 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
}

/* patch the first byte */
- ret = text_poke(addr, opcode, sizeof(int3));
+ ret = text_poke(addr, opcode, sizeof(bp_int3));
BUG_ON(ret);

run_sync();
@@ -704,3 +719,218 @@ fail:
return ret;
}

+static int text_poke_check(void *addr, const void *opcode, size_t len)
+{
+ unsigned char actual[MAX_PATCH_LEN];
+ int ret;
+
+ ret = probe_kernel_read(actual, addr, len);
+ if (unlikely(ret))
+ return -EFAULT;
+
+ ret = memcmp(actual, opcode, len);
+ if (unlikely(ret))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *old_opcode;
+ int ret = 0;
+
+ /* nope if the code is not defined */
+ old_opcode = iterator->get_old_opcode(iter);
+ if (!old_opcode)
+ return 0;
+
+ addr = iterator->get_addr(iter);
+ ret = text_poke_check(addr, old_opcode, bp_int3_len);
+
+ if (likely(!ret))
+ /* write the breakpoint */
+ ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
+
+ return ret;
+}
+
+static int update_iter_code(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *opcode;
+
+ /* nope if the code is not defined */
+ opcode = iterator->get_opcode(iter);
+ if (!opcode)
+ return 0;
+
+ /* write all but the first byte */
+ addr = iterator->get_addr(iter);
+ return text_poke(addr + sizeof(bp_int3),
+ opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
+}
+
+static int finish_iter_update(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *opcode;
+ int ret = 0;
+
+ opcode = iterator->get_opcode(iter);
+ if (opcode) {
+ /* write the first byte if defined */
+ addr = iterator->get_addr(iter);
+ ret = text_poke(addr, opcode, sizeof(bp_int3));
+ }
+
+ /* Patching has finished. Let's update the status flag if any. */
+ if (!ret && iterator->finish)
+ ret = iterator->finish(iter);
+
+ return ret;
+}
+
+static void recover_iter(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *old_opcode;
+ unsigned char actual[MAX_PATCH_LEN];
+ int err;
+
+ /* If opcode is not defined, we did not change this address at all */
+ old_opcode = iterator->get_old_opcode(iter);
+ if(!old_opcode)
+ return;
+
+ addr = iterator->get_addr(iter);
+ err = probe_kernel_read(actual, addr, bp_int3_len);
+ /* There is no way to recover the system if we even can not read */
+ BUG_ON(err);
+
+ /* We are done if there is no interrupt */
+ if (actual[0] != bp_int3)
+ return;
+
+ /* Put the old code back behind the interrupt if it is not there */
+ if (memcmp(actual + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3)) != 0) {
+ err = text_poke(addr + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
+ /* we should not continue if recovering has failed */
+ BUG_ON(err);
+ /*
+ * It is not optimal to sync for each repaired address.
+ * But this is a corner case and we are in bad state anyway.
+ * Note that this is not needed on Intel, see text_poke_bp
+ */
+ run_sync();
+ }
+
+ /* Finally, put back the first byte from the old code */
+ err = text_poke(addr, old_opcode, sizeof(bp_int3));
+ /* we can not continue if the interrupt is still there */
+ BUG_ON(err);
+}
+
+#define for_text_poke_bp_iter(iterator,iter) \
+ for (iter = iterator->start(iterator->init); \
+ iter; \
+ iter = iterator->next(iter))
+
+/**
+ * text_poke_bp_list() -- update instructions on live kernel on SMP
+ * @iterator: structure that allows iterating over the patched addresses
+ * and getting all the needed information
+ *
+ * Modify several instructions using the int3 breakpoint on SMP in parallel.
+ * The principle is the same as in text_poke_bp. But synchronizing all CPUs
+ * is a relatively slow operation, so we need to reduce the number of syncs.
+ * Hence we add the breakpoint to all instructions before we modify the other
+ * bytes of all of them, and so on.
+ *
+ * The operation tries to keep a consistent state. If anything goes wrong,
+ * it makes a best effort to restore the original state. The only exception
+ * is addresses where the patching has already finished. The caller can be
+ * informed about these via the finish callback.
+ *
+ * It is a bit more paranoid than text_poke_bp because it checks the actual
+ * code before patching. This is a good practice proposed by the ftrace code.
+ *
+ * Note: This function must be called under text_mutex.
+ */
+int text_poke_bp_list(struct text_poke_bp_iter *iterator)
+{
+ void *iter;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret = 0;
+
+ bp_int3 = 0xcc;
+ bp_int3_addr = NULL;
+ bp_int3_len = iterator->len;
+ bp_int3_handler = NULL;
+ bp_int3_is_handled = iterator->is_handled;
+ bp_patching_in_progress = true;
+
+ smp_wmb();
+
+ /* add interrupts */
+ for_text_poke_bp_iter(iterator, iter) {
+ ret = add_iter_breakpoint(iterator, iter);
+ if (ret)
+ goto fail;
+ count++;
+ }
+ run_sync();
+
+ report = "updating code";
+ if (bp_int3_len - sizeof(bp_int3) > 0) {
+ count = 0;
+ /* patch all but the first byte */
+ for_text_poke_bp_iter(iterator, iter) {
+ ret = update_iter_code(iterator, iter);
+ if (ret)
+ goto fail;
+ count++;
+ }
+ run_sync();
+ }
+
+ /* patch the first byte */
+ report = "finishing update";
+ count = 0;
+ for_text_poke_bp_iter(iterator, iter) {
+ ret = finish_iter_update(iterator, iter);
+ if (ret)
+ goto fail;
+ count++;
+ }
+ run_sync();
+
+fail:
+ if (unlikely(ret)) {
+ printk(KERN_WARNING "text_poke_bp_iter failed on %s (%d):\n",
+ report, count);
+ /* save information about the failure */
+ iterator->fail_count = count;
+ iterator->fail_addr = iterator->get_addr(iter);
+ /* try to recover the old state */
+ for_text_poke_bp_iter(iterator, iter) {
+ recover_iter(iterator, iter);
+ }
+ run_sync();
+ }
+
+ bp_patching_in_progress = false;
+ smp_wmb();
+
+ return ret;
+}
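The layout of struct text_poke_bp_iter itself is declared in alternative.h by this
patch and is not quoted above. To make the callback contract easier to follow, here
is an assumed sketch of its shape, inferred purely from how text_poke_bp_list()
uses the iterator; the field types and comments are illustrative, not the definitive
interface from the patch:

	/* Assumed shape, inferred from text_poke_bp_list() above. */
	struct text_poke_bp_iter {
		void *init;			/* opaque state handed to start()      */
		size_t len;			/* length of every patched instruction */
		void *(*start)(void *init);	/* first record, or NULL               */
		void *(*next)(void *iter);	/* next record, or NULL when done      */
		void *(*get_addr)(void *iter);
		const void *(*get_opcode)(void *iter);	   /* new code, or NULL to skip      */
		const void *(*get_old_opcode)(void *iter); /* expected code, or NULL to skip */
		int (*finish)(void *iter);	/* optional per-record completion hook */
		void *(*is_handled)(const unsigned long ip); /* used by poke_int3_handler  */
		/* filled in when patching fails */
		int fail_count;
		void *fail_addr;
	};

A caller fills the callbacks with functions that walk its own list of records (the
ftrace patches later in this series do exactly that) and then calls
text_poke_bp_list(&iter) under text_mutex.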

Petr Mladek

Dec 3, 2013, 8:30:02 AM
Hi,

I am resending this patchset as there were no comments on v4. The only change is
that I have rebased it on top of today's kernel/git/tip/tip.git.

The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine")

This patch set merges the two implementations and removes duplicate code from
the ftrace side.

+ The first four patches improve the generic int3-based framework to be usable
with ftrace. All the changes are based on ideas already used in ftrace.
They are needed to keep the current functionality and efficiency.

+ The 5th patch speeds up the original ftrace code but is also useful
with the generic functions.

+ The last three patches modify different parts of the current
x86-specific ftrace implementation and use the generic functions there.

Changes in v5:

+ rebased on top of current kernel/git/tip/tip.git

Changes in v4:

+ reverted one change in text_poke in the first patch as suggested by
Masami Hiramatsu; it means to still call BUG_ON() when the error is
unclear

Changes in v3:

+ use the new text_poke_part only in text_poke_bp_list because it
works only with read-write code
+ update all text_poke functions to return an error instead of calling BUG()
+ handle text_poke* function return values whenever they were used

Changes in v2:

+ check for number of CPUs instead of enabling IRQs when syncing CPUs;
suggested by Steven Rostedt, Paul E. McKenney, and Masami Hiramatsu
+ return error codes in text_poke_part and text_poke_bp; needed by ftrace
+ reverted changes in text_poke_bp; it patches only single address again
+ introduce text_poke_bp_iter for patching multiple addresses:
+ uses iterator and callbacks instead of copying data
+ checks old code before patching
+ returns error code and more info about error; needed by ftrace
+ provides recovery mechanism in case of errors
+ update ftrace.c to use the new text_poke_bp_iter
+ split notrace __probe_kernel_read into separate patch because it
is useful for original ftrace code as well
+ rebased on current kernel tip and updated performance statistics;
it started to be slower on an idle machine after the commit
c229828ca6bc62d6c654 (rcu: Throttle rcu_try_advance_all_cbs() execution)

I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer has also been enabled and disabled.
With 500 cycles, I got these times before the change:

real 18m2.477s 18m8.654s 18m9.196s
user 0m0.008s 0m0.008s 0m0.012s
sys 0m17.316s 0m17.104s 0m17.300s

and after

real 17m19.055s 17m29.200s 17m27.456
user 0m0.052s 0m0.056s 0m0.052s
sys 0m20.236s 0m20.068s 0m20.264s

The patches are against kernel/git/tip/tip.git on top of the commit
c22638a1d22e79c "Merge branch 'tools/kvm'"

Petr Mladek (8):
x86: allow to handle errors in text_poke function family
x86: allow to call text_poke_bp during boot
x86: add generic function to modify more calls using int3 framework
x86: speed up int3-based patching using direct write
x86: do not trace __probe_kernel_read
x86: modify ftrace function using the new int3-based framework
x86: patch all traced function calls using the int3-based framework
x86: enable/disable ftrace graph call using new int3-based framework

arch/x86/include/asm/alternative.h | 43 ++-
arch/x86/kernel/alternative.c | 369 +++++++++++++++++++++--
arch/x86/kernel/ftrace.c | 583 ++++++++-----------------------------
arch/x86/kernel/jump_label.c | 11 +-
arch/x86/kernel/kgdb.c | 11 +-
arch/x86/kernel/kprobes/core.c | 10 +-
arch/x86/kernel/kprobes/opt.c | 8 +-
arch/x86/kernel/traps.c | 10 -
include/linux/ftrace.h | 6 -
mm/maccess.c | 2 +-
10 files changed, 527 insertions(+), 526 deletions(-)

Petr Mladek

Dec 3, 2013, 8:30:03 AM
When trying to use text_poke_bp_iter in ftrace, the result was slower than
the original implementation.

It turned out that one difference was in text_poke vs. ftrace_write.
text_poke did many extra operations to make sure that code was read-write
and the change was atomic.

In fact, we do not need the atomic operation inside text_poke_bp_iter because
the modified code is guarded by the int3 handler. The int3 interrupt itself
is added and removed by an atomic operation because we modify only one byte.

Also, if we patch many functions, it is more efficient to set the whole kernel text
read-write instead of remapping each address into a helper address space.

This patch adds text_poke_part which is inspired by ftrace_write.
Then it is used in text_poke_bp_iter instead of the slower text_poke.
It adds the limitation that the text_poke_bp_iter user is responsible for
setting the patched code read-write.

Note that we can't use it in text_poke because it is not efficient to set
a page or two read-write just to patch one piece of code.
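To make the new caller responsibility concrete, here is a minimal sketch of how a
user of text_poke_bp_list() could bracket the call. It relies on the existing
set_kernel_text_rw()/set_kernel_text_ro() and set_all_modules_text_rw()/
set_all_modules_text_ro() helpers that ftrace already uses around code
modification; the function name patch_many_sites() is made up for the example:

	/* Hypothetical caller: make the text writable only around the batch. */
	static int patch_many_sites(struct text_poke_bp_iter *iter)
	{
		int ret;

		mutex_lock(&text_mutex);
		set_kernel_text_rw();
		set_all_modules_text_rw();

		ret = text_poke_bp_list(iter);

		set_all_modules_text_ro();
		set_kernel_text_ro();
		mutex_unlock(&text_mutex);

		return ret;
	}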

For example, I tried to switch between 7 tracers: blk, branch, function_graph,
wakeup_rt, irqsoff, function, and nop. Every tracer has also been enabled and
disabled. With 100 cycles, I got these times with text_poke:

real 25m7.380s 25m13.44s 25m9.072s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m3.480s 0m3.508s 0m3.420s

and with text_poke_part:

real 3m23.335s 3m24.422s 3m26.686s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m3.724s 0m3.772s 0m3.588s

Signed-off-by: Petr Mladek <pml...@suse.cz>
Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
---
arch/x86/kernel/alternative.c | 67 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2f6088282724..9f8dd4f9e852 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -598,6 +598,51 @@ int __kprobes text_poke(void *addr, const void *opcode, size_t len)
return 0;
}

+/**
+ * text_poke_part - Update part of the instruction on a live kernel when using
+ * int3 breakpoint
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * If we patch instructions using int3 interrupt, we do not need to be so
+ * careful about an atomic write. Adding and removing the interrupt is atomic
+ * because we modify only one byte. The rest of the instruction could be
+ * modified in several steps because it is guarded by the interrupt handler.
+ * Hence we could use faster code here. See also text_poke_bp_iter below
+ * for more details.
+ *
+ * Note: This function must be called under text_mutex. Also the caller is
+ * responsible for setting the patched code read-write. This is more effective
+ * only when you patch many addresses at the same time.
+ */
+static int text_poke_part(void *addr, const void *opcode, size_t len)
+{
+ int ret;
+
+ /* The patching makes sense only for text code */
+ if (unlikely((unsigned long)addr < (unsigned long)_text) ||
+ unlikely((unsigned long)addr >= (unsigned long)_etext))
+ return -EFAULT;
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+ /*
+ * On x86_64, the kernel text mapping is always mapped read-only with
+ * CONFIG_DEBUG_RODATA. Instead, we need to use the kernel identity
+ * mapping that can be set read-write, for example using
+ * set_kernel_text_rw().
+ */
+ addr = __va(__pa_symbol(addr));
+#endif
+
+ ret = probe_kernel_write(addr, opcode, len);
+ if (unlikely(ret))
+ return -EPERM;
+
+ sync_core();
+ return 0;
+}
+
static void do_sync_core(void *info)
{
sync_core();
@@ -752,7 +797,7 @@ static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,

if (likely(!ret))
/* write the breakpoint */
- ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
+ ret = text_poke_part(addr, &bp_int3, sizeof(bp_int3));

return ret;
}
@@ -770,9 +815,9 @@ static int update_iter_code(struct text_poke_bp_iter *iterator,

/* write all but the first byte */
addr = iterator->get_addr(iter);
- return text_poke(addr + sizeof(bp_int3),
- opcode + sizeof(bp_int3),
- bp_int3_len - sizeof(bp_int3));
+ return text_poke_part(addr + sizeof(bp_int3),
+ opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
}

static int finish_iter_update(struct text_poke_bp_iter *iterator,
@@ -786,7 +831,7 @@ static int finish_iter_update(struct text_poke_bp_iter *iterator,
if (opcode) {
/* write the first byte if defined */
addr = iterator->get_addr(iter);
- ret = text_poke(addr, opcode, sizeof(bp_int3));
+ ret = text_poke_part(addr, opcode, sizeof(bp_int3));
}

/* Patching has finished. Let's update the status flag if any. */
@@ -822,9 +867,9 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
if (memcmp(actual + sizeof(bp_int3),
old_opcode + sizeof(bp_int3),
bp_int3_len - sizeof(bp_int3)) != 0) {
- err = text_poke(addr + sizeof(bp_int3),
- old_opcode + sizeof(bp_int3),
- bp_int3_len - sizeof(bp_int3));
+ err = text_poke_part(addr + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
/* we should not continue if recovering has failed */
BUG_ON(err);
/*
@@ -836,7 +881,7 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
}

/* Finally, put back the first byte from the old code */
- err = text_poke(addr, old_opcode, sizeof(bp_int3));
+ err = text_poke_part(addr, old_opcode, sizeof(bp_int3));
/* we can not continue if the interrupt is still there */
BUG_ON(err);
}
@@ -864,7 +909,9 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
* It is a bit more paranoid than text_poke_bp because it checks the actual
* code before patching. This is a good practice proposed by the ftrace code.
*
- * Note: This function must be called under text_mutex.
+ * Note: This function must be called under text_mutex. Also the caller is
+ * responsible for making the patched code read-write, for example using
+ * set_kernel_text_rw() and set_all_modules_text_rw()
*/
int text_poke_bp_list(struct text_poke_bp_iter *iterator)
{

Masami Hiramatsu

Dec 6, 2013, 8:30:01 PM
(2013/12/03 22:21), Petr Mladek wrote:
> The text_poke functions called BUG() in case of error. This was too strict.
> There are situations when the system is still usable even when the patching
> has failed, for example when enabling the dynamic ftrace.
>
> This commit modifies text_poke, text_poke_early, and text_poke_bp functions
> to return an error code instead calling BUG(). The code is returned instead
> of the patched address. The address was just copied from the first parameter,
> so it was no extra information. It has not been used anywhere yet.

Hmm, this change is basically good for me. However, from the maintenance point
of view, I'd like to recommend introducing some wrappers for them
that check the return code and just do BUG() instead of changing all the call sites,
because except for text_poke_bp, we cannot roll back the code safely
(e.g. text_poke() returns an error but text_poke_or_die() just calls BUG()
when it fails).

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hi...@hitachi.com

Petr Mladek

Dec 10, 2013, 10:50:01 AM
One more change related to replacing the existing int3-based framework with
the new generic function introduced by the commit fd4363fff3d9
(x86: Introduce int3 (breakpoint)-based instruction patching).

ftrace_enable_ftrace_graph_caller and ftrace_disable_ftrace_graph_caller
modified the jump target directly without using the int3 guard. It worked
because writing the address was an atomic operation.

We do not really need to use the safe ftrace_modify_code here, but it helps
to remove more arch-specific code. The result is more consistent
and more readable code, which might be worth the change.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
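As a rough illustration only (this sketch is not the diff itself), the reworked
enable path could look something like the code below. It assumes a helper that
builds the 5-byte jmp instruction, here called ftrace_jmp_replace() by analogy
with the existing ftrace_call_replace(); that helper name and the exact wiring
are assumptions, only ftrace_modify_code() comes from the earlier patch in this
series:

	/* Hypothetical shape of the reworked graph-caller toggle. */
	int ftrace_enable_ftrace_graph_caller(void)
	{
		unsigned long ip = (unsigned long)(&ftrace_graph_call);
		unsigned const char *old, *new;

		/* old: jmp to ftrace_stub, new: jmp to ftrace_graph_caller */
		old = ftrace_jmp_replace(ip, (unsigned long)&ftrace_stub);
		new = ftrace_jmp_replace(ip, (unsigned long)&ftrace_graph_caller);

		return ftrace_modify_code(ip, old, new);
	}

The disable path would simply swap the old and new opcodes.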

Petr Mladek

Dec 10, 2013, 10:50:01 AM
We would like to use text_poke_bp in ftrace. It might be called also during
boot when there is only one CPU and we do not need to sync the others.

The check is a must-have because interrupts are also disabled during
boot. The call would then cause a deadlock; see the warning in
"smp_call_function_many", kernel/smp.c:371.

The change is inspired by the code in arch/x86/kernel/ftrace.c.

Signed-off-by: Petr Mladek <pml...@suse.cz>
Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
---
arch/x86/kernel/alternative.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index eabed9326d2a..6436beec7b0c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -599,6 +599,17 @@ static void do_sync_core(void *info)
sync_core();
}

+static void run_sync(void)
+{
+ /*
+ * We do not need to sync other cores during boot when there is only one
+ * CPU enabled. In fact, we must not because there are also disabled
+ * interrupts. The call would fail because of a potential deadlock.
+ */
+ if (num_online_cpus() != 1)
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

@@ -661,7 +672,7 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
if (unlikely(ret))
goto fail;

- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();

if (len - sizeof(int3) > 0) {
/*
@@ -676,13 +687,13 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();
}

/* Patch the first byte. We do not know how to recover from an error. */
text_poke_or_die(addr, opcode, sizeof(int3));

- on_each_cpu(do_sync_core, NULL, 1);
+ run_sync();

fail:
bp_patching_in_progress = false;

Petr Mladek

Dec 10, 2013, 10:50:01 AM
Signed-off-by: Petr Mladek <pml...@suse.cz>
Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
---
arch/x86/kernel/alternative.c | 67 ++++++++++++++++++++++++++++++++++++-------
1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8e57ac03a0e8..03abf9abf681 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -595,6 +595,51 @@ void __kprobes text_poke_or_die(void *addr, const void *opcode, size_t len)
BUG_ON(err);
static void do_sync_core(void *info)
{
sync_core();
@@ -759,7 +804,7 @@ static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,

if (likely(!ret))
/* write the breakpoint */
- ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
+ ret = text_poke_part(addr, &bp_int3, sizeof(bp_int3));

return ret;
}
@@ -777,9 +822,9 @@ static int update_iter_code(struct text_poke_bp_iter *iterator,

/* write all but the first byte */
addr = iterator->get_addr(iter);
- return text_poke(addr + sizeof(bp_int3),
- opcode + sizeof(bp_int3),
- bp_int3_len - sizeof(bp_int3));
+ return text_poke_part(addr + sizeof(bp_int3),
+ opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
}

static int finish_iter_update(struct text_poke_bp_iter *iterator,
@@ -793,7 +838,7 @@ static int finish_iter_update(struct text_poke_bp_iter *iterator,
if (opcode) {
/* write the first byte if defined */
addr = iterator->get_addr(iter);
- ret = text_poke(addr, opcode, sizeof(bp_int3));
+ ret = text_poke_part(addr, opcode, sizeof(bp_int3));
}

/* Patching has finished. Let's update the status flag if any. */
@@ -829,9 +874,9 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
if (memcmp(actual + sizeof(bp_int3),
old_opcode + sizeof(bp_int3),
bp_int3_len - sizeof(bp_int3)) != 0) {
- err = text_poke(addr + sizeof(bp_int3),
- old_opcode + sizeof(bp_int3),
- bp_int3_len - sizeof(bp_int3));
+ err = text_poke_part(addr + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
/* we should not continue if recovering has failed */
BUG_ON(err);
/*
@@ -843,7 +888,7 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
}

/* Finally, put back the first byte from the old code */
- err = text_poke(addr, old_opcode, sizeof(bp_int3));
+ err = text_poke_part(addr, old_opcode, sizeof(bp_int3));
/* we can not continue if the interrupt is still there */
BUG_ON(err);
}
@@ -871,7 +916,9 @@ static void recover_iter(struct text_poke_bp_iter *iterator,
* It is a bit more paranoid than text_poke_bp because it checks the actual
* code before patching. This is a good practice proposed by the ftrace code.
*
- * Note: This function must be called under text_mutex.
+ * Note: This function must be called under text_mutex. Also the caller is
+ * responsible for making the patched code read-write, for example using
+ * set_kernel_text_rw() and set_all_modules_text_rw()
*/
int text_poke_bp_list(struct text_poke_bp_iter *iterator)
{

Petr Mladek

Dec 10, 2013, 10:50:01 AM
probe_kernel_read is used when modifying function calls in ftrace_replace_code,
see arch/x86/kernel/ftrace.c. On x86, the code is replaced using the int3 guard.
All functions are patched in parallel to reduce the expensive synchronization
of all CPUs. The result is that all affected functions are called via
ftrace_int3_handler during the process.

ftrace_int3_handler is relatively slow because it has to check whether
it is responsible for the handled IP. Therefore we should not patch
functions that are used during the patching itself; it would slow down the process.

I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer has also been enabled and disabled.
With 500 cycles, I got these times before this commit:

real 18m2.477s 18m8.654s 18m9.196s
user 0m0.008s 0m0.008s 0m0.012s
sys 0m17.316s 0m17.104s 0m17.300s

and after this commit:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
mm/maccess.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/maccess.c b/mm/maccess.c
index d53adf9ba84b..bed9ee854ea0 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,7 +18,7 @@
long __weak probe_kernel_read(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_read")));

-long __probe_kernel_read(void *dst, const void *src, size_t size)
+long notrace __probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();

Petr Mladek

Dec 10, 2013, 10:50:02 AM
The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine")

When trying to use text_poke_bp in dynamic ftrace, the process was significantly
slower than the original code. For example, I tried to switch between 7 tracers:
blk, branch, function_graph, wakeup_rt, irqsoff, function, and nop. Every tracer
has also been enabled and disabled. With 500 cycles, I got these times with the
original code:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

and with text_poke_bp:

real 29m45.785s 29m47.504s 29m44.016
user 0m0.004s 0m0.004s 0m0.004s
Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/include/asm/alternative.h | 36 ++++++
arch/x86/kernel/alternative.c | 257 +++++++++++++++++++++++++++++++++++--
2 files changed, 280 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 586747f5f41d..82ffe7e1529c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -232,4 +232,40 @@ extern int text_poke_bp(void *addr, const void *opcode, size_t len,
extern void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6436beec7b0c..8e57ac03a0e8 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,7 @@
#include <linux/stringify.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
@@ -610,8 +611,11 @@ static void run_sync(void)
on_each_cpu(do_sync_core, NULL, 1);
}

+static char bp_int3;
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;
+static size_t bp_int3_len;
+static void *(*bp_int3_is_handled)(const unsigned long ip);

int poke_int3_handler(struct pt_regs *regs)
{
@@ -621,14 +625,23 @@ int poke_int3_handler(struct pt_regs *regs)
if (likely(!bp_patching_in_progress))
return 0;

- if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ if (user_mode_vm(regs))
return 0;

- /* set up the specified breakpoint handler */
- regs->ip = (unsigned long) bp_int3_handler;
+ /* Check if address is handled by text_poke_bp */
+ if (bp_int3_handler && regs->ip == (unsigned long)bp_int3_addr) {
+ regs->ip = (unsigned long)bp_int3_handler;
+ return 1;
+ }

- return 1;
+ /* Check if address is handled by text_poke_bp_list */
+ if (bp_int3_is_handled && bp_int3_is_handled(regs->ip)) {
+ /* just skip the instruction */
+ regs->ip += bp_int3_len - 1;
+ return 1;
+ }

+ return 0;
}

/**
@@ -655,11 +668,13 @@ int poke_int3_handler(struct pt_regs *regs)
*/
int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
- unsigned char int3 = 0xcc;
- int ret = 0;
+ int ret;

+ bp_int3 = 0xcc;
bp_int3_handler = handler;
- bp_int3_addr = (u8 *)addr + sizeof(int3);
+ bp_int3_addr = (u8 *)addr + sizeof(bp_int3);
+ bp_int3_len = len;
+ bp_int3_is_handled = NULL;
bp_patching_in_progress = true;
/*
* Corresponding read barrier in int3 notifier for
@@ -668,20 +683,20 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
*/
smp_wmb();

- ret = text_poke(addr, &int3, sizeof(int3));
+ ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
if (unlikely(ret))
goto fail;

run_sync();

- if (len - sizeof(int3) > 0) {
+ if (len - sizeof(bp_int3) > 0) {
/*
* Patch all but the first byte. We do not know how to recover
* from an error at this stage.
*/
- text_poke_or_die((char *)addr + sizeof(int3),
- (const char *) opcode + sizeof(int3),
- len - sizeof(int3));
+ text_poke_or_die((char *)addr + sizeof(bp_int3),
+ (const char *) opcode + sizeof(bp_int3),
+ len - sizeof(bp_int3));
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -691,7 +706,7 @@ int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
}

/* Patch the first byte. We do not know how to recover from an error. */
- text_poke_or_die(addr, opcode, sizeof(int3));
+ text_poke_or_die(addr, opcode, sizeof(bp_int3));

run_sync();

@@ -710,3 +725,219 @@ void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
err = text_poke_bp(addr, opcode, len, handler);
BUG_ON(err);
}
+
+static int text_poke_check(void *addr, const void *opcode, size_t len)
+{
+ unsigned char actual[MAX_PATCH_LEN];
+ int ret;
+
+ ret = probe_kernel_read(actual, addr, len);
+ if (unlikely(ret))
+ return -EFAULT;
+
+ ret = memcmp(actual, opcode, len);
+ if (unlikely(ret))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *old_opcode;
+ int ret = 0;
+
+ /* nothing to do if the code is not defined */
+ old_opcode = iterator->get_old_opcode(iter);
+ if (!old_opcode)
+ return 0;
+
+ addr = iterator->get_addr(iter);
+ ret = text_poke_check(addr, old_opcode, bp_int3_len);
+
+ if (likely(!ret))
+ /* write the breakpoint */
+ ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
+
+ return ret;
+}
+
+static int update_iter_code(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *opcode;
+
+ /* nothing to do if the code is not defined */
+ opcode = iterator->get_opcode(iter);
+ if (!opcode)
+ return 0;
+
+ /* write all but the first byte */
+ addr = iterator->get_addr(iter);
+ return text_poke(addr + sizeof(bp_int3),
+ opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
+}
+
+static int finish_iter_update(struct text_poke_bp_iter *iterator,
+ void *iter)
+{
+ void *addr;
+ const void *opcode;
+ int ret = 0;
+
+ opcode = iterator->get_opcode(iter);
+ if (opcode) {
+ /* write the first byte if defined */
+ addr = iterator->get_addr(iter);
+ ret = text_poke(addr, opcode, sizeof(bp_int3));
+ }
+ if (memcmp(actual + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3)) != 0) {
+ err = text_poke(addr + sizeof(bp_int3),
+ old_opcode + sizeof(bp_int3),
+ bp_int3_len - sizeof(bp_int3));
+ * It is a bit more paranoid than text_poke_bp because it checks the actual
+ * code before patching. This is a good practice proposed by the ftrace code.
+ *
+ * Note: This function must be called under text_mutex.

Borislav Petkov

Dec 10, 2013, 10:50:02 AM
On Tue, Dec 10, 2013 at 04:42:14PM +0100, Petr Mladek wrote:
> We would like to use text_poke_bp in ftrace. It might be called also during
> boot when there is only one CPU and we do not need to sync the others.
>
> The check is a must-have because interrupts are also disabled during
> boot. The call would then cause a deadlock; see the warning in
> "smp_call_function_many", kernel/smp.c:371.
>
> The change is inspired by the code in arch/x86/kernel/ftrace.c.
>
> Signed-off-by: Petr Mladek <pml...@suse.cz>
> Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
> ---
> arch/x86/kernel/alternative.c | 17 ++++++++++++++---
> 1 file changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index eabed9326d2a..6436beec7b0c 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -599,6 +599,17 @@ static void do_sync_core(void *info)
> sync_core();
> }
>
> +static void run_sync(void)

Can we call this sync_cores()?

It is what it does. :)

> +{
> + /*
> + * We do not need to sync other cores during boot when there is only one
> + * CPU enabled. In fact, we must not because there are also disabled
> + * interrupts. The call would fail because of a potential deadlock.
> + */
> + if (num_online_cpus() != 1)
> + on_each_cpu(do_sync_core, NULL, 1);
> +}

--
Regards/Gruss,
Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

Petr Mladek

Dec 10, 2013, 10:50:02 AM
The text_poke functions called BUG() in case of error. This was too strict.
There are situations when the system is still usable even when the patching
has failed, for example when enabling the dynamic ftrace.

This commit modifies text_poke and text_poke_bp functions to return an error
code instead of calling BUG(). They used to return the patched address. But
the address was just copied from the first parameter. It was no extra
information and it has not been used anywhere yet.

There are some situations where it is hard to recover from an error. Masami
Hiramatsu <masami.hi...@hitachi.com> suggested creating
text_poke*_or_die() variants for this purpose.

Last but not least, we modify the return value of the text_poke_early() function.
It is not capable of returning an error code, so let's return void to make
that clear.

Finally, we also modify the few locations where the text_poke functions
were used and where the error code now has to be handled.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/include/asm/alternative.h | 10 +++++--
arch/x86/kernel/alternative.c | 61 ++++++++++++++++++++++++++------------
arch/x86/kernel/jump_label.c | 7 +++--
arch/x86/kernel/kgdb.c | 11 +++++--
arch/x86/kernel/kprobes/core.c | 5 ++--
arch/x86/kernel/kprobes/opt.c | 8 ++---
6 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d5..586747f5f41d 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -208,7 +208,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
#define __parainstructions_end NULL
#endif

-extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+extern void text_poke_early(void *addr, const void *opcode, size_t len);

/*
* Clear and restore the kernel write-protection flag on the local CPU.
@@ -224,8 +224,12 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
-extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern int text_poke(void *addr, const void *opcode, size_t len);
+extern void text_poke_or_die(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
-extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern int text_poke_bp(void *addr, const void *opcode, size_t len,
+ void *handler);
+extern void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
+ void *handler);

#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index df94598ad05a..eabed9326d2a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -242,7 +242,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
-void *text_poke_early(void *addr, const void *opcode, size_t len);
+void text_poke_early(void *addr, const void *opcode, size_t len);

/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
@@ -304,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
continue;
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
- text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ text_poke_or_die(ptr, ((unsigned char []){0xf0}), 1);
}
mutex_unlock(&text_mutex);
}
@@ -322,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
continue;
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
- text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ text_poke_or_die(ptr, ((unsigned char []){0x3E}), 1);
}
mutex_unlock(&text_mutex);
}
@@ -522,10 +522,10 @@ void __init alternative_instructions(void)
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
* Also no thread must be currently preempted in the middle of these
- * instructions. And on the local CPU you need to be protected again NMI or MCE
- * handlers seeing an inconsistent instruction while you patch.
+ * instructions. And on the local CPU you need to protect NMI and MCE
+ * handlers against seeing an inconsistent instruction while you patch.
*/
-void *__init_or_module text_poke_early(void *addr, const void *opcode,
+void __init_or_module text_poke_early(void *addr, const void *opcode,
size_t len)
{
unsigned long flags;
@@ -535,7 +535,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
- return addr;
}

/**
@@ -551,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+int __kprobes text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
@@ -566,7 +565,8 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
WARN_ON(!PageReserved(pages[0]));
pages[1] = virt_to_page(addr + PAGE_SIZE);
}
- BUG_ON(!pages[0]);
+ if (unlikely(!pages[0]))
+ return -EFAULT;
local_irq_save(flags);
set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
if (pages[1])
@@ -583,7 +583,15 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
for (i = 0; i < len; i++)
BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
local_irq_restore(flags);
- return addr;
+ return 0;
+}
+
+void __kprobes text_poke_or_die(void *addr, const void *opcode, size_t len)
+{
+ int err;
+
+ err = text_poke(addr, opcode, len);
+ BUG_ON(err);
}

static void do_sync_core(void *info)
@@ -634,9 +642,10 @@ int poke_int3_handler(struct pt_regs *regs)
*
* Note: must be called under text_mutex.
*/
-void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+int text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
unsigned char int3 = 0xcc;
+ int ret = 0;

bp_int3_handler = handler;
bp_int3_addr = (u8 *)addr + sizeof(int3);
@@ -648,15 +657,20 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
*/
smp_wmb();

- text_poke(addr, &int3, sizeof(int3));
+ ret = text_poke(addr, &int3, sizeof(int3));
+ if (unlikely(ret))
+ goto fail;

on_each_cpu(do_sync_core, NULL, 1);

if (len - sizeof(int3) > 0) {
- /* patch all but the first byte */
- text_poke((char *)addr + sizeof(int3),
- (const char *) opcode + sizeof(int3),
- len - sizeof(int3));
+ /*
+ * Patch all but the first byte. We do not know how to recover
+ * from an error at this stage.
+ */
+ text_poke_or_die((char *)addr + sizeof(int3),
+ (const char *) opcode + sizeof(int3),
+ len - sizeof(int3));
/*
* According to Intel, this core syncing is very likely
* not necessary and we'd be safe even without it. But
@@ -665,14 +679,23 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
on_each_cpu(do_sync_core, NULL, 1);
}

- /* patch the first byte */
- text_poke(addr, opcode, sizeof(int3));
+ /* Patch the first byte. We do not know how to recover from an error. */
+ text_poke_or_die(addr, opcode, sizeof(int3));

on_each_cpu(do_sync_core, NULL, 1);

+fail:
bp_patching_in_progress = false;
smp_wmb();

- return addr;
+ return ret;
}

+void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
+ void *handler)
+{
+ int err;
+
+ err = text_poke_bp(addr, opcode, len, handler);
+ BUG_ON(err);
+}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..d87b2946a121 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -38,7 +38,7 @@ static void bug_at(unsigned char *ip, int line)

static void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
- void *(*poker)(void *, const void *, size_t),
+ void (*poker)(void *, const void *, size_t),
int init)
{
union jump_code_union code;
@@ -98,8 +98,9 @@ static void __jump_label_transform(struct jump_entry *entry,
if (poker)
(*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
else
- text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
- (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+ text_poke_bp_or_die((void *)entry->code, &code,
+ JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
}

index 79a3f9682871..8cb9b709661b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -409,12 +409,13 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)

void __kprobes arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ text_poke_or_die(p->addr,
+ ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}

void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, &p->opcode, 1);
+ text_poke_or_die(p->addr, &p->opcode, 1);
}

void __kprobes arch_remove_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 898160b42e43..1e4fee746517 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -390,8 +390,8 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
insn_buf[0] = RELATIVEJUMP_OPCODE;
*(s32 *)(&insn_buf[1]) = rel;

- text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ text_poke_bp_or_die(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);

list_del_init(&op->list);
}
@@ -405,8 +405,8 @@ void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
/* Set int3 to first byte for kprobes */
insn_buf[0] = BREAKPOINT_INSTRUCTION;
memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
- text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
- op->optinsn.insn);
+ text_poke_bp_or_die(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
}

/*
--
1.8.4

Petr Mladek

Dec 10, 2013, 10:50:03 AM
Let's continue with replacing the existing int3-based framework in ftrace with
the new generic function introduced by the commit fd4363fff3d9 (x86: Introduce
int3 (breakpoint)-based instruction patching).

This time, use it in ftrace_replace_code, which modifies all the watched function
calls.

If we used text_poke_bp in ftrace_make_nop, ftrace_make_call, and ftrace_modify_call,
it would be possible to get rid of the x86-specific ftrace_replace_code
implementation and use the generic code.

This would be a really lovely change. Unfortunately, the code would be slow
because syncing on each CPU is a relatively expensive operation. For example,
I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer has also been enabled and disabled.
With 500 cycles, I got these times with the original code:

real 16m14.390s 16m15.200s 16m19.632s
user 0m0.028s 0m0.024s 0m0.028s
sys 0m23.788s 0m23.812s 0m23.804s

It slowed down after using text_poke_bp for each record separately:

real 29m45.785s 29m47.504s 29m44.016
user 0m0.004s 0m0.004s 0m0.004s
sys 0m15.776s 0m16.088s 0m16.192s

Hence, we implemented the more complex text_poke_bp_iter. When we use it,
we get the times:

real 17m19.055s 17m29.200s 17m27.456
user 0m0.052s 0m0.056s 0m0.052s
sys 0m20.236s 0m20.068s 0m20.264s

It is slightly slower than the ftrace-specific code. Well, this is the
cost of using a generic API that can also be used in other locations.
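A minimal sketch of how the pieces could fit together on the ftrace side follows.
struct ftrace_tp_iter and ftrace_tp_iter_get_old_opcode() appear in the diff below;
the other ftrace_tp_iter_* helper names and the exact error handling are assumed
here for illustration and are not necessarily what the patch does:

	/* Illustrative wiring only; the real implementation is in the diff below. */
	void ftrace_replace_code(int enable)
	{
		struct ftrace_tp_iter tp_iter = { .enable = enable };
		struct text_poke_bp_iter iter = {
			.init		= &tp_iter,
			.len		= MCOUNT_INSN_SIZE,
			.start		= ftrace_tp_iter_start,
			.next		= ftrace_tp_iter_next,
			.get_addr	= ftrace_tp_iter_get_addr,
			.get_opcode	= ftrace_tp_iter_get_opcode,
			.get_old_opcode	= ftrace_tp_iter_get_old_opcode,
			.finish		= ftrace_tp_iter_finish,
			.is_handled	= ftrace_tp_iter_is_handled,
		};

		/* text is assumed to have been made read-write by the caller,
		 * as required by text_poke_part in the previous patch */
		if (text_poke_bp_list(&iter))
			pr_warn("ftrace: patching failed at %p after %d records\n",
				iter.fail_addr, iter.fail_count);
	}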

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/kernel/alternative.c | 14 +-
arch/x86/kernel/ftrace.c | 428 +++++++-----------------------------------
arch/x86/kernel/traps.c | 10 -
include/linux/ftrace.h | 6 -
4 files changed, 80 insertions(+), 378 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 03abf9abf681..e5c02af3a8cc 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,7 +613,7 @@ void __kprobes text_poke_or_die(void *addr, const void *opcode, size_t len)
* responsible for setting the patched code read-write. This is more effective
* only when you patch many addresses at the same time.
*/
-static int text_poke_part(void *addr, const void *opcode, size_t len)
+static int notrace text_poke_part(void *addr, const void *opcode, size_t len)
{
int ret;

@@ -662,7 +662,7 @@ static void *bp_int3_handler, *bp_int3_addr;
static size_t bp_int3_len;
static void *(*bp_int3_is_handled)(const unsigned long ip);

-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
/* bp_patching_in_progress */
smp_rmb();
@@ -771,7 +771,7 @@ void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
BUG_ON(err);
}

-static int text_poke_check(void *addr, const void *opcode, size_t len)
+static int notrace text_poke_check(void *addr, const void *opcode, size_t len)
{
unsigned char actual[MAX_PATCH_LEN];
int ret;
@@ -787,7 +787,7 @@ static int text_poke_check(void *addr, const void *opcode, size_t len)
return 0;
}

-static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
+static int notrace add_iter_breakpoint(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -809,7 +809,7 @@ static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
return ret;
}

-static int update_iter_code(struct text_poke_bp_iter *iterator,
+static int notrace update_iter_code(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -827,7 +827,7 @@ static int update_iter_code(struct text_poke_bp_iter *iterator,
bp_int3_len - sizeof(bp_int3));
}

-static int finish_iter_update(struct text_poke_bp_iter *iterator,
+static int notrace finish_iter_update(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
@@ -848,7 +848,7 @@ static int finish_iter_update(struct text_poke_bp_iter *iterator,
return ret;
}

-static void recover_iter(struct text_poke_bp_iter *iterator,
+static void notrace recover_iter(struct text_poke_bp_iter *iterator,
void *iter)
{
void *addr;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 5ade40e4a175..92fe8cac0802 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -127,23 +127,6 @@ static const unsigned char *ftrace_nop_replace(void)
}

static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = ftrace_check_code(ip, old_code);
-
- /* replace the text with the new text */
- if (!ret && do_ftrace_mod_code(ip, new_code))
- return -EPERM;
-
- sync_core();
-
- return 0;
-}
-
-static int
ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
@@ -168,20 +151,7 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();

- /*
- return 0;
-
- if (!ftrace_location(regs->ip - 1))
- return 0;
-
- regs->ip += MCOUNT_INSN_SIZE - 1;
-
- return 1;
-}
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return probe_kernel_write((void *)ip, val, size);
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- if (ftrace_write(ip, &brk, 1))
- return -EPERM;
-
- return 0;
-}
-
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
- case FTRACE_UPDATE_MODIFY_CALL:
- ftrace_addr = get_ftrace_old_addr(rec);
- /* fall through */
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
+struct ftrace_tp_iter {
+ struct ftrace_rec_iter *rec_iter;
+ struct dyn_ftrace *rec;
+ int enable;
+};

-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
+static struct ftrace_tp_iter *ftrace_tp_set_record(struct ftrace_tp_iter *tp_iter)
{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ if (!tp_iter->rec_iter)
+ return NULL;

- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return -1;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = get_ftrace_addr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = get_ftrace_old_addr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- if (ftrace_write(ip, new, 1))
- return -EPERM;
-
- return 0;
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- if (ftrace_write(ip, new, 1))
- return -EPERM;
- return 0;
}

-static int finish_update(struct dyn_ftrace *rec, int enable)
+const void *ftrace_tp_iter_get_old_opcode(void *cur)
{
- unsigned long ftrace_addr;
+ struct ftrace_tp_iter *tp_iter = cur;
+ unsigned long old_addr;
int ret;

- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(tp_iter->rec, tp_iter->enable);

switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-

Petr Mladek

Dec 10, 2013, 10:50:02 AM
This resubmission primarily adds text_poke*_or_die wrappers as suggested by
Masami Hiramatsu. See below for more details. It basically modifies
only the first patch in the set.

The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine")

This patch set merges the two implementations and removes duplicate code from
the ftrace side.

+ The first four patches improve the generic int3-based framework to be usable
with ftrace. All the changes are based on ideas already used in ftrace.
They are needed to keep the current functionality and efficiency.

+ The 5th patch speeds up the original ftrace code but is also useful
with the generic functions.

+ The last three patches modify different parts of the current
x86-specific ftrace implementation and use the generic functions there.


Changes in v6:
+ added text_poke_or_die and text_poke_bp_or_die wrappers as suggested
by Masami Hiramatsu
+ modified text_poke_early to return void instead of zero to make it
clear that it did not return any real error code
+ used the text_poke*_or_die functions where appropriate
I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt,
irqsoff, function, and nop. Every tracer has also been enabled and disabled.
With 500 cycles, I got these times before the change:

real 18m2.477s 18m8.654s 18m9.196s
user 0m0.008s 0m0.008s 0m0.012s
sys 0m17.316s 0m17.104s 0m17.300s

and after

real 17m19.055s 17m29.200s 17m27.456
user 0m0.052s 0m0.056s 0m0.052s
sys 0m20.236s 0m20.068s 0m20.264s

The patches are against kernel/git/tip/tip.git on top of the commit
631d5ea8b461f906f87c864f1c27db6eb5211591 (Merge branch 'linus')

Petr Mladek (8):
x86: allow to handle errors in text_poke function family
x86: allow to call text_poke_bp during boot
x86: add generic function to modify more calls using int3 framework
x86: speed up int3-based patching using direct write
x86: do not trace __probe_kernel_read
x86: modify ftrace function using the new int3-based framework
x86: patch all traced function calls using the int3-based framework
x86: enable/disable ftrace graph call using new int3-based framework

arch/x86/include/asm/alternative.h | 46 ++-
arch/x86/kernel/alternative.c | 372 +++++++++++++++++++++--
arch/x86/kernel/ftrace.c | 583 ++++++++-----------------------------
arch/x86/kernel/jump_label.c | 7 +-
arch/x86/kernel/kgdb.c | 11 +-
arch/x86/kernel/kprobes/core.c | 5 +-
arch/x86/kernel/kprobes/opt.c | 8 +-
arch/x86/kernel/traps.c | 10 -
include/linux/ftrace.h | 6 -
mm/maccess.c | 2 +-
10 files changed, 525 insertions(+), 525 deletions(-)

Petr Mladek

Dec 10, 2013, 10:50:02 AM
The commit fd4363fff3d9 (x86: Introduce int3 (breakpoint)-based instruction
patching) uses the same technique that has been used in ftrace since 08d636b
("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine")

It would be great to use the new generic implementation and clean up the x86
ftrace code a bit.

Let's start with ftrace_modify_code. It does basically the same as text_poke_bp.
In addition, it checks whether the replaced code is the expected one to avoid
problems with modules. The checking code can be split from
ftrace_modify_code_direct.

Note that we do not longer need to set modifying_ftrace_code in
ftrace_update_ftrace_func. The int3 interrupt will be handled by
poke_int3_handler.

Signed-off-by: Petr Mladek <pml...@suse.cz>
---
arch/x86/kernel/ftrace.c | 120 +++++++++++++++++++----------------------------
1 file changed, 47 insertions(+), 73 deletions(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d4bdd253fea7..5ade40e4a175 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -52,6 +52,33 @@ union ftrace_code_union {
} __attribute__((packed));
};

+/*
+ * Note: Due to modules and __init, code can
+ * disappear and change, we need to protect against faulting
+ * as well as code changing. We do this by using the
+ * probe_kernel_* functions.
+ */
+static int
+ftrace_check_code(unsigned long ip, unsigned const char *expected)
+{
+ unsigned char actual[MCOUNT_INSN_SIZE];
+
+ if (probe_kernel_read(actual, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ if (memcmp(actual, expected, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ONCE(1,
+ "ftrace check failed: %x %x%x%x%x vs. %x %x%x%x%x\n",
+ actual[0],
+ actual[1], actual[2], actual[3], actual[4],
+ expected[0],
+ expected[1], expected[2], expected[3], expected[4]);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ftrace_calc_offset(long ip, long addr)
{
return (int)(addr - ip);
@@ -103,28 +130,12 @@ static int
ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- /*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
- */
+ int ret;

- /* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
+ ret = ftrace_check_code(ip, old_code);

/* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
+ if (!ret && do_ftrace_mod_code(ip, new_code))
return -EPERM;

sync_core();
@@ -132,6 +143,22 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
return 0;
}

+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = ftrace_check_code(ip, old_code);
+
+ /* replace the text with the new text */
+ if (!ret)
+ ret = text_poke_bp((void *)ip, new_code, MCOUNT_INSN_SIZE,
+ (void *)ip + MCOUNT_INSN_SIZE);
+
+ return ret;
+}
+
int ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
{
@@ -202,10 +229,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
*/
atomic_t modifying_ftrace_code __read_mostly;

-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
@@ -230,9 +253,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);

- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ret = ftrace_modify_code(ip, old, new);

/* Also update the regs callback function */
@@ -243,20 +263,9 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ret = ftrace_modify_code(ip, old, new);
}

- atomic_dec(&modifying_ftrace_code);
-
return ret;
}

-static int is_ftrace_caller(unsigned long ip)
-{
- if (ip == (unsigned long)(&ftrace_call) ||
- ip == (unsigned long)(&ftrace_regs_call))
- return 1;
-
- return 0;
-}
-
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -266,13 +275,10 @@ static int is_ftrace_caller(unsigned long ip)
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
- unsigned long ip;
-
if (WARN_ON_ONCE(!regs))
return 0;

- ip = regs->ip - 1;
- if (!ftrace_location(ip) && !is_ftrace_caller(ip))
+ if (!ftrace_location(regs->ip - 1))
return 0;

regs->ip += MCOUNT_INSN_SIZE - 1;
@@ -621,38 +627,6 @@ void ftrace_replace_code(int enable)
}
}

-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- if (ret) {
- ret = -EPERM;
- goto out;
- }
- run_sync();
- out:
- return ret;
-
- fail_update:
- probe_kernel_write((void *)ip, &old_code[0], 1);
- goto out;
-}
-
void arch_ftrace_update_code(int command)
{
/* See comment above by declaration of modifying_ftrace_code */

Borislav Petkov

Dec 10, 2013, 11:10:02 AM
On Tue, Dec 10, 2013 at 11:01:57AM -0500, Steven Rostedt wrote:
> That can be changed in a later patch if needed. It's a static function
> so the visibility is limited. Not to mention, it was just moved from
> ftrace.c which calls it run_sync().

Moved?

The diffstat has only

arch/x86/kernel/alternative.c | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)

or is the move in yet another patch?

> We need to keep the same name for the move to not confuse the history
> of where it came from.

Whatever makes you ftrace dudes happy. :-)

Steven Rostedt

Dec 10, 2013, 11:10:01 AM
On Tue, 10 Dec 2013 16:46:12 +0100
Borislav Petkov <b...@alien8.de> wrote:

> On Tue, Dec 10, 2013 at 04:42:14PM +0100, Petr Mladek wrote:
> > We would like to use text_poke_bp in ftrace. It might be called also during
> > boot when there is only one CPU and we do not need to sync the others.
> >
> > The check is necessary because interrupts are also disabled during
> > boot. Then the call would cause a deadlock; see the warning in
> > "smp_call_function_many", kernel/smp.c:371.
> >
> > The change is inspired by the code in arch/x86/kernel/ftrace.c.
> >
> > Signed-off-by: Petr Mladek <pml...@suse.cz>
> > Reviewed-by: Masami Hiramatsu <masami.hi...@hitachi.com>
> > ---
> > arch/x86/kernel/alternative.c | 17 ++++++++++++++---
> > 1 file changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> > index eabed9326d2a..6436beec7b0c 100644
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> > @@ -599,6 +599,17 @@ static void do_sync_core(void *info)
> > sync_core();
> > }
> >
> > +static void run_sync(void)
>
> Can we call this sync_cores()?
>
> It is what it does. :)

That can be changed in a later patch if needed. It's a static function
so the visibility is limited. Not to mention, it was just moved from
ftrace.c which calls it run_sync(). We need to keep the same name for
the move to not confuse the history of where it came from.

-- Steve

>
> > +{
> > + /*
> > + * We do not need to sync other cores during boot when there is only one
> > + * CPU enabled. In fact, we must not because there are also disabled
> > + * interrupts. The call would fail because of a potential deadlock.
> > + */
> > + if (num_online_cpus() != 1)
> > + on_each_cpu(do_sync_core, NULL, 1);
> > +}
>


Petr Mladek

Dec 10, 2013, 11:50:01 AM
Borislav Petkov wrote on Tue, 10. 12. 2013 at 17:08 +0100:
> On Tue, Dec 10, 2013 at 11:01:57AM -0500, Steven Rostedt wrote:
> > That can be changed in a later patch if needed. It's a static function
> > so the visibility is limited. Not to mention, it was just moved from
> > ftrace.c which calls it run_sync().
>
> Moved?
>
> The diffstat has only
>
> arch/x86/kernel/alternative.c | 17 ++++++++++++++---
> 1 file changed, 14 insertions(+), 3 deletions(-)
>
> or is the move in yet another patch?

Yes, it is removed from arch/x86/kernel/ftrace.c in the 7th patch of
this patch set.

Best Regards,
Petr

Masami Hiramatsu

Dec 10, 2013, 10:00:02 PM
(2013/12/11 0:42), Petr Mladek wrote:
> The text_poke functions called BUG() in case of error. This was too strict.
> There are situations when the system is still usable even when the patching
> has failed, for example when enabling the dynamic ftrace.
>
> This commit modifies text_poke and text_poke_bp functions to return an error
> code instead of calling BUG(). They used to return the patched address. But
> the address was just copied from the first parameter. It was no extra
> information and it has not been used anywhere yet.
>
> There are some situations where it is hard to recover from an error. Masami
> Hiramatsu <masami.hi...@hitachi.com> suggested to create
> text_poke*_or_die() variants for this purpose.
>
> Last but not least, we modify return value of the text_poke_early() function.
> It is not capable of returning an error code. Let's return void to make
> it clear.
>
> Finally, we also need to modify the few locations where text_poke functions
> were used and the error code has to be handled now.
>
> Signed-off-by: Petr Mladek <pml...@suse.cz>

This looks good to me.

Acked-by: Masami Hiramatsu <masami.hi...@hitachi.com>

Thank you!
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hi...@hitachi.com


Steven Rostedt

Jan 14, 2014, 6:30:02 PM

FYI, for future patches, start the subject with a capital letter. ie:
x86: Allow to handle errors in text_poke function family

On Tue, 10 Dec 2013 16:42:13 +0100
Petr Mladek <pml...@suse.cz> wrote:

> The text_poke functions called BUG() in case of error. This was too strict.
> There are situations when the system is still usable even when the patching
> has failed, for example when enabling the dynamic ftrace.
>
> This commit modifies text_poke and text_poke_bp functions to return an error
> code instead of calling BUG(). They used to return the patched address. But
> the address was just copied from the first parameter. It was no extra
> information and it has not been used anywhere yet.
>
> There are some situations where it is hard to recover from an error. Masami
> Hiramatsu <masami.hi...@hitachi.com> suggested to create
> text_poke*_or_die() variants for this purpose.

I don't like the "_or_die()". Although I don't care much about it, I'm
thinking the x86 maintainers might not like it either.

What about just doing the test in the places that would call "or_die"?

ret = text_poke*();
BUG_ON(ret);

?

-- Steve

>
> Last but not least, we modify return value of the text_poke_early() function.
> It is not capable of returning an error code. Let's return void to make
> it clear.
>
> Finally, we also need to modify the few locations where text_poke functions
> were used and the error code has to be handled now.
>
> Signed-off-by: Petr Mladek <pml...@suse.cz>
> ---

Steven Rostedt

Jan 14, 2014, 7:40:02 PM
On Tue, 10 Dec 2013 16:42:15 +0100
Petr Mladek <pml...@suse.cz> wrote:

> diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
> index 586747f5f41d..82ffe7e1529c 100644
> --- a/arch/x86/include/asm/alternative.h
> +++ b/arch/x86/include/asm/alternative.h
> @@ -232,4 +232,40 @@ extern int text_poke_bp(void *addr, const void *opcode, size_t len,
> extern void text_poke_bp_or_die(void *addr, const void *opcode, size_t len,
> void *handler);

Small nit. If you can, place comments on the same line as the
structure field.

> +struct text_poke_bp_iter {
> + /* information used to start iteration from the beginning */
> + void *init;
> + /* length of the patched instruction */
> + size_t len;
> + /* details about failure if any */
> + int fail_count;
> + void *fail_addr;

The above should have the comments on the same line as the field.
Something like this:

void *init; /* information used to start
iteration from the beginning */

The comments for the function pointers below are fine.
bp_int3 is not going to be anything but 0xcc. Let's change that to:

static char bp_int3 = 0xcc;

And remove the other initializations.
We could remove this as it should be constant.
Shouldn't we be setting the bp_int3_handler back to NULL here?
The above comment does not make sense.

> + old_opcode = iterator->get_old_opcode(iter);
> + if (!old_opcode)
> + return 0;
> +
> + addr = iterator->get_addr(iter);
> + ret = text_poke_check(addr, old_opcode, bp_int3_len);
> +
> + if (likely(!ret))
> + /* write the breakpoint */

Comment is redundant and can be removed.

> + ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
> +
> + return ret;
> +}
> +
> +static int update_iter_code(struct text_poke_bp_iter *iterator,
> + void *iter)
> +{
> + void *addr;
> + const void *opcode;
> +
> + /* nope if the code is not defined */

Still does not make sense :-)
"... if we could not even read"

But this begs the question. If we fail on reading, we call the recovery
mode, and this code will run on the place that we tried to read, and
here we bug, never going back to the caller telling us where we bugged
and why.
Nit, but swap the above arguments to stay consistent with other for_*()
macros, such as list_for_each_entry(). That is, the iterator
is the first parameter. Also make sure there's a space after the ','.

> + for (iter = iterator->start(iterator->init); \
> + iter; \
> + iter = iterator->next(iter))
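For illustration, this is roughly what the loop would look like with the iterator first; the macro name below is made up, since it is not visible in the quoted hunk, and only the argument order is the point:

#define for_each_text_poke_bp_entry(iterator, iter)		\
	for ((iter) = (iterator)->start((iterator)->init);	\
	     (iter);						\
	     (iter) = (iterator)->next(iter))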
> +
> +/**
> + * text_poke_bp_list() -- update instructions on live kernel on SMP
> + * @iterator: structure that allows to iterate over the patched addressed
> + * and get all the needed information

There's a hidden space before the two tabs above.
(checkpatch.pl complains).

> + *
> + * Modify several instructions by using int3 breakpoint on SMP in parallel.
> + * The principle is the same as in text_poke_bp. But synchronization of all CPUs
> + * is relatively slow operation, so we need to reduce it. Hence we add interrupt
> + * for all instructions before we modify the other bytes, etc.

The last sentence does not make sense. Do you mean "Hence we place a
breakpoint to all instructions before we modify the other bytes"?
This can not be a blind recovery. Only failure of adding the breakpoint
should recover, as it should do the read/compare/write test first. If
it fails after that, then it was something else and it is OK to bug.

But in any case, the recover_iter() should only be called "count"
times. We should put back the breakpoint only on those that were set,
and not try anything else.

-- Steve


> + recover_iter(iterator, iter);
> + }
> + run_sync();
> + }
> +
> + bp_patching_in_progress = false;
> + smp_wmb();
> +
> + return ret;
> +}


Steven Rostedt

Jan 14, 2014, 7:50:02 PM
Another nit, because I'm anal. I know Intel confuses things by calling
int3 an interrupt, when in reality it is a trap. There's nothing
asynchronous about it and nothing is being interrupted. Can we use the
term "trap" or "breakpoint" instead of "interrupt"?

> + * Hence we could use faster code here. See also text_poke_bp_iter below
> + * for more details.
> + *
> + * Note: This function must be called under text_mutex. Also the caller is
> + * responsible for setting the patched code read-write. This is more effective
> + * only when you patch many addresses at the same time.

s/many/several/

> + */
> +static int text_poke_part(void *addr, const void *opcode, size_t len)
> +{
> + int ret;
> +
> + /* The patching makes sense only for a text code */

s/for a/for/

-- Steve

> + if (unlikely((unsigned long)addr < (unsigned long)_text) ||
> + unlikely((unsigned long)addr >= (unsigned long)_etext))
> + return -EFAULT;
> +
> +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
> + /*
> + * On x86_64, the kernel text mapping is always mapped read-only with
> + * CONFIG_DEBUG_RODATA. Instead, we need to use the kernel identity
> + * mapping that can be set read-write, for example using
> + * set_kernel_text_rw().
> + */
> + addr = __va(__pa_symbol(addr));
> +#endif
> +
> + ret = probe_kernel_write(addr, opcode, len);
> + if (unlikely(ret))
> + return -EPERM;
> +
> + sync_core();
> + return 0;
> +}
> +

Steven Rostedt

Jan 14, 2014, 8:00:03 PM
On Tue, 10 Dec 2013 16:42:17 +0100
Petr Mladek <pml...@suse.cz> wrote:

> probe_kernel_read is used when modifying function calls in ftrace_replace_code,
> see arch/x86/kernel/ftrace.c. On x86, the code is replaced using int3 guard.
> All functions are patched in parallel to reduce an expensive synchronization
> of all CPUs. The result is that all affected functions are called via
> ftrace_int3_handler during the process.
>
> ftrace_int3_handler is relatively slow because it has to check whether
> it is responsible for the handled IP. Therefore we should not modify
> functions that are used during patching. It would slow down the patching.
>
> I tried to switch between 7 tracers: blk, branch, function_graph, wakeup_rt,
> irqsoff, function, and nop. Every tracer has also been enabled and disabled.
> With 500 cycles, I got these times before this commit:
>
> real 18m2.477s 18m8.654s 18m9.196s
> user 0m0.008s 0m0.008s 0m0.012s
> sys 0m17.316s 0m17.104s 0m17.300s
>
> and after this commit:
>
> real 16m14.390s 16m15.200s 16m19.632s
> user 0m0.028s 0m0.024s 0m0.028s
> sys 0m23.788s 0m23.812s 0m23.804s
>

Actually no. The benefit of tracing a function outweighs the benefit
of speeding up the conversion. If you want a fast conversion, simply:

echo __probe_kernel_read > /sys/kernel/debug/tracing/set_ftrace_notrace

Try your numbers again with the above line.

-- Steve

> Signed-off-by: Petr Mladek <pml...@suse.cz>
> ---
> mm/maccess.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/maccess.c b/mm/maccess.c
> index d53adf9ba84b..bed9ee854ea0 100644
> --- a/mm/maccess.c
> +++ b/mm/maccess.c
> @@ -18,7 +18,7 @@
> long __weak probe_kernel_read(void *dst, const void *src, size_t size)
> __attribute__((alias("__probe_kernel_read")));
>
> -long __probe_kernel_read(void *dst, const void *src, size_t size)
> +long notrace __probe_kernel_read(void *dst, const void *src, size_t size)
> {
> long ret;
> mm_segment_t old_fs = get_fs();


Steven Rostedt

Jan 14, 2014, 8:10:02 PM
Actually the above comment is no longer accurate. I know that you just
copied it from my code, and this is my fault. But instead of keeping an
inaccurate comment around, let's fix it.

/*
* Note: Modifying kernel code is extremely critical to get right.
* We must try to detect any error, and we should only change code
* with code that we expect to be changed. Always test that the old
* text is what we expect it to be before we modify it, and fail
* if it is not.
*/

> + */
> +static int
> +ftrace_check_code(unsigned long ip, unsigned const char *expected)
> +{
> + unsigned char actual[MCOUNT_INSN_SIZE];
> +
> + if (probe_kernel_read(actual, (void *)ip, MCOUNT_INSN_SIZE))
> + return -EFAULT;
> +
> + if (memcmp(actual, expected, MCOUNT_INSN_SIZE) != 0) {
> + WARN_ONCE(1,
> + "ftrace check failed: %x %x%x%x%x vs. %x %x%x%x%x\n",
> + actual[0],
> + actual[1], actual[2], actual[3], actual[4],
> + expected[0],
> + expected[1], expected[2], expected[3], expected[4]);
> + return -EINVAL;

Hmm, I guess I'm fine with warning here like this, but it is also
covered in kernel/ftrace.c, with the return code of -EINVAL.

But duplicate warnings may not be that bad.
Wait? If this fails, why not return with ret? Otherwise, we just
returned with success!

-- Steve

>
> /* replace the text with the new text */
> - if (do_ftrace_mod_code(ip, new_code))
> + if (!ret && do_ftrace_mod_code(ip, new_code))
> return -EPERM;
>
> sync_core();
> @@ -132,6 +143,22 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
> return 0;
> }
>
> +static int
> +ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
> + unsigned const char *new_code)
> +{
> + int ret;
> +
> + ret = ftrace_check_code(ip, old_code);
> +
> + /* replace the text with the new text */
> + if (!ret)
> + ret = text_poke_bp((void *)ip, new_code, MCOUNT_INSN_SIZE,
> + (void *)ip + MCOUNT_INSN_SIZE);
> +
> + return ret;
> +}
> +
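For reference, a minimal sketch of the direct variant with the check failure propagated, as asked above. This is only an illustration assembled from the hunks quoted in this thread, not an actual follow-up patch:

static int
ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
			  unsigned const char *new_code)
{
	int ret;

	ret = ftrace_check_code(ip, old_code);
	if (ret)
		return ret;	/* propagate -EFAULT/-EINVAL instead of returning success */

	/* replace the text with the new text */
	if (do_ftrace_mod_code(ip, new_code))
		return -EPERM;

	sync_core();

	return 0;
}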

Masami Hiramatsu

Jan 15, 2014, 3:20:01 AM
just a comment.
If it is always 0xcc, it should be a const variable.

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hi...@hitachi.com


Steven Rostedt

Jan 15, 2014, 9:20:02 AM
On Wed, 15 Jan 2014 17:18:01 +0900
Masami Hiramatsu <masami.hi...@hitachi.com> wrote:


> >> +static char bp_int3;
> >
> > bp_int3 is not going to be anything but 0xcc. Let's change that to:
> >
> > static char bp_int3 = 0xcc;
> >
> > And remove the other initializations.
>
> just a comment.
> If it is always 0xcc, it should be a const variable.
>

Yeah, I was thinking that too. As long as sizeof(bp_int3) still works,
which it should for just adding a "const", and not make it a macro.

static const char bp_int3 = 0xcc;

Thanks!

-- Steve

Steven Rostedt

Jan 15, 2014, 10:50:01 AM
On Tue, 10 Dec 2013 16:42:19 +0100
Petr Mladek <pml...@suse.cz> wrote:

>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 03abf9abf681..e5c02af3a8cc 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -613,7 +613,7 @@ void __kprobes text_poke_or_die(void *addr, const void *opcode, size_t len)
> * responsible for setting the patched code read-write. This is more effective
> * only when you patch many addresses at the same time.
> */
> -static int text_poke_part(void *addr, const void *opcode, size_t len)
> +static int notrace text_poke_part(void *addr, const void *opcode, size_t len)

I can understand why you want to not trace any of these, but again, I
hate using notrace unless it is absolutely necessary. That's what the
set_ftrace_notrace is for. If you don't want to trace it, simply tell
the kernel not to.
The above is a nice comment. It should be copied over to the
alternative.c file for the explanation of bp_patching_in_progress.
Leave this, it's good for other archs. Which reminds me, I need to
update them.

-- Steve

> int ftrace_update_record(struct dyn_ftrace *rec, int enable);
> int ftrace_test_record(struct dyn_ftrace *rec, int enable);
> void ftrace_run_stop_machine(int command);


Petr Mladek

Jan 21, 2014, 8:10:01 AM
On Tue, 2014-01-14 at 18:20 -0500, Steven Rostedt wrote:
> FYI, for future patches, start the subject with a capital letter. ie:
> x86: Allow to handle errors in text_poke function family
>
> On Tue, 10 Dec 2013 16:42:13 +0100
> Petr Mladek <pml...@suse.cz> wrote:
>
> > The text_poke functions called BUG() in case of error. This was too strict.
> > There are situations when the system is still usable even when the patching
> > has failed, for example when enabling the dynamic ftrace.
> >
> > This commit modifies text_poke and text_poke_bp functions to return an error
> > code instead of calling BUG(). They used to return the patched address. But
> > the address was just copied from the first parameter. It was no extra
> > information and it has not been used anywhere yet.
> >
> > There are some situations where it is hard to recover from an error. Masami
> > Hiramatsu <masami.hi...@hitachi.com> suggested to create
> > text_poke*_or_die() variants for this purpose.
>
> I don't like the "_or_die()". Although I don't care much about it, I'm
> thinking the x86 maintainers might not like it either.
>
> What about just doing the test in the places that would call "or_die"?
>
> ret = text_poke*();
> BUG_ON(ret);

Exactly this solution has been used in v5 of this patch set, see
https://lkml.org/lkml/2013/12/3/258

Masami suggested to use the "or_die()" because BUG_ON() was used on most
locations, see https://lkml.org/lkml/2013/12/6/1107

I personally do not have any strong opinion about it and will do
whatever makes x86 maintainers happy :-)

Best Regards,
Petr

Petr Mladek

Jan 21, 2014, 9:00:02 AM
On Tue, 2014-01-14 at 19:33 -0500, Steven Rostedt wrote:
> On Tue, 10 Dec 2013 16:42:15 +0100
> Petr Mladek <pml...@suse.cz> wrote:
>
> > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> > index 6436beec7b0c..8e57ac03a0e8 100644
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
It might be cleaner but it is not really needed. "poke_int3_handler()"
checks "bp_int3_handler" only when "bp_patching_in_progress" is enabled.
The "in_progress" variable is disabled right after the above mentioned
"run_sync()", so we are on the safe side.

Note that the original "text_poke_bp()" implementation disabled only the
"in_progress" variable at the end as well.


> > +static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
> > + void *iter)
> > +{
> > + void *addr;
> > + const void *old_opcode;
> > + int ret = 0;
> > +
> > + /* nope if the code is not defined */
>
> The above comment does not make sense.

It is here to handle the situation when "ftrace_test_record(rec,
enable)" returns FTRACE_UPDATE_IGNORE. In this case, even the original
implementation does not add the breakpoint.

I did not want to confuse the universal implementation with extra flags.
Instead, I passed NULL "old_code" pointer when the patching was not
needed for this particular address.

I agree that it might be a bit confusing. The question is whether it is
enough to improve documentation or rather use an extra flag or so.

I am going to improve the comments unless you say otherwise.

>
> > + old_opcode = iterator->get_old_opcode(iter);
> > + if (!old_opcode)
> > + return 0;
> > +
> > + addr = iterator->get_addr(iter);
> > + ret = text_poke_check(addr, old_opcode, bp_int3_len);
> > +
> > + if (likely(!ret))
> > + /* write the breakpoint */
>
> Comment is redundant and can be removed.
>
> > + ret = text_poke(addr, &bp_int3, sizeof(bp_int3));
> > +
> > + return ret;
> > +}
> > +
> > +static int update_iter_code(struct text_poke_bp_iter *iterator,
> > + void *iter)
> > +{
> > + void *addr;
> > + const void *opcode;
> > +
> > + /* nope if the code is not defined */
>
> Still does not make sense :-)

It is the same reason/trick that is used in "add_iter_breakpoint()".
NULL code pointer means that we actually do not want to patch this
particular address.

The rest of your comments is clear. I am updating the patch set. Thanks
a lot for the feedback.


Best Regards,
Petr
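For illustration, the kind of expanded comment promised above could look roughly like this; the wording is hypothetical and not taken from any posted patch:

	/*
	 * A NULL old opcode means that the iterator does not want this
	 * address patched in this round (for ftrace this happens when
	 * ftrace_test_record() returns FTRACE_UPDATE_IGNORE), so there is
	 * nothing to check and nothing to patch here.
	 */
	old_opcode = iterator->get_old_opcode(iter);
	if (!old_opcode)
		return 0;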

Steven Rostedt

Jan 21, 2014, 9:10:01 AM
On Tue, 21 Jan 2014 14:00:37 +0100
Petr Mladek <pml...@suse.cz> wrote:

> > > There are some situations where it is hard to recover from an error. Masami
> > > Hiramatsu <masami.hi...@hitachi.com> suggested to create
> > > text_poke*_or_die() variants for this purpose.
> >
> > I don't like the "_or_die()". Although I don't care much about it, I'm
> > thinking the x86 maintainers might not like it either.
> >
> > What about just doing the test in the places that would call "or_die"?
> >
> > ret = text_poke*();
> > BUG_ON(ret);
>
> Exactly this solution has been used in v5 of this patch set, see
> https://lkml.org/lkml/2013/12/3/258
>
> Masami suggested to use the "or_die()" because BUG_ON() was used on most
> locations, see https://lkml.org/lkml/2013/12/6/1107

If BUG_ON() is used in most locations, then we can make text_poke()
default to bug, and then just have a text_poke_safe() function that does
not bug. Or some similar name.

The "_die" has a bad taste in several developers mouth ;-)

-- Steve

Steven Rostedt

Jan 21, 2014, 9:10:01 AM
On Tue, 21 Jan 2014 14:50:15 +0100
Petr Mladek <pml...@suse.cz> wrote:

> On Tue, 2014-01-14 at 19:33 -0500, Steven Rostedt wrote:
> > >
> > > /* Patch the first byte. We do not know how to recover from an error. */
> > > - text_poke_or_die(addr, opcode, sizeof(int3));
> > > + text_poke_or_die(addr, opcode, sizeof(bp_int3));
> > >
> > > run_sync();
> >
> > Shouldn't we be setting the bp_int3_handler back to NULL here?
>
> It might be cleaner but it is not really needed. "poke_int3_handler()"

Yeah, I was stating that as more of being "cleaner". I saw that the
current code handles this, but it feels like it would be more robust to
set it to NULL now.

> checks "bp_int3_handler" only when "bp_patching_in_progress" is enabled.
> The "in_progress" variable is disabled right after the above mentioned
> "run_sync()", so we are on the safe side.
>
> Note that the original "text_poke_bp()" implementation disabled only the
> "in_progress" variable at the end as well.
>
>
> > > +static int add_iter_breakpoint(struct text_poke_bp_iter *iterator,
> > > + void *iter)
> > > +{
> > > + void *addr;
> > > + const void *old_opcode;
> > > + int ret = 0;
> > > +
> > > + /* nope if the code is not defined */
> >
> > The above comment does not make sense.
>
> It is here to handle the situation when "ftrace_test_record(rec,
> enable)" returns FTRACE_UPDATE_IGNORE. In this case, even the original
> implementation does not add the breakpoint.
>
> I did not want to confuse the universal implementation with extra flags.
> Instead, I passed NULL "old_code" pointer when the patching was not
> needed for this particular address.
>
> I agree that it might be a bit confusing. The question is whether it is
> enough to improve documentation or rather use an extra flag or so.
>
> I am going to improve the comments unless you say otherwise.

I was confused by the comment. Please add more documentation in the
comment to explain this. You can state what it is there for (for
ftrace to ignore records) without having to add extra flags. But 10
years from now, we want developers to understand why things were done
the way they were done without having to look through email archives or
git logs.
Good, please add more text to the comment to explain it better.

Thanks,

-- Steve

Masami Hiramatsu

Jan 21, 2014, 8:00:01 PM
(2014/01/21 23:02), Steven Rostedt wrote:
> On Tue, 21 Jan 2014 14:00:37 +0100
> Petr Mladek <pml...@suse.cz> wrote:
>
>>>> There are some situations where it is hard to recover from an error. Masami
>>>> Hiramatsu <masami.hi...@hitachi.com> suggested to create
>>>> text_poke*_or_die() variants for this purpose.
>>>
>>> I don't like the "_or_die()". Although I don't care much about it, I'm
>>> thinking the x86 maintainers might not like it either.
>>>
>>> What about just doing the test in the places that would call "or_die"?
>>>
>>> ret = text_poke*();
>>> BUG_ON(ret);
>>
>> Exactly this solution has been used in v5 of this patch set, see
>> https://lkml.org/lkml/2013/12/3/258
>>
>> Masami suggested to use the "or_die()" because BUG_ON() was used on most
>> locations, see https://lkml.org/lkml/2013/12/6/1107
>
> If BUG_ON() is used in most locations, then we can make text_poke()
> default to bug, and then just have a text_poke_safe() function that does
> not bug. Or some similar name.

Unfortunately, since there is still a BUG_ON() in text_poke() when
we fail to modify the text, I think text_poke_safe() is not a good
name either.

> The "_die" has a bad taste in several developers mouth ;-)

What about using text_poke() for BUG_ON and __text_poke()
for returning an error? This way the caller sites may not need to change.

Thank you,

--
Masami HIRAMATSU
IT Management Research Dept. Linux Technology Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hi...@hitachi.com
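Purely as an illustration of the split suggested here, with hypothetical prototypes that may differ from what the series ends up using:

/* Returns 0 on success or a negative error code; callers may recover. */
int __text_poke(void *addr, const void *opcode, size_t len);

/* Keeps the historical "cannot fail" behaviour for existing call sites. */
static inline void text_poke(void *addr, const void *opcode, size_t len)
{
	BUG_ON(__text_poke(addr, opcode, len));
}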


Steven Rostedt

Jan 21, 2014, 8:20:02 PM
On Wed, 22 Jan 2014 09:52:03 +0900
Masami Hiramatsu <masami.hi...@hitachi.com> wrote:

> What about using text_poke() for BUG_ON and __text_poke()
> for returning an error? This way the caller sites may not need to change.
>


Sounds fine with me.

-- Steve

Petr Mladek

Jan 22, 2014, 8:30:03 AM
On Wed, 2014-01-15 at 10:47 -0500, Steven Rostedt wrote:
> On Tue, 10 Dec 2013 16:42:19 +0100
> Petr Mladek <pml...@suse.cz> wrote:
>
> >
> > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> > index 03abf9abf681..e5c02af3a8cc 100644
> > --- a/arch/x86/kernel/alternative.c
> > +++ b/arch/x86/kernel/alternative.c
> > @@ -613,7 +613,7 @@ void __kprobes text_poke_or_die(void *addr, const void *opcode, size_t len)
> > * responsible for setting the patched code read-write. This is more effective
> > * only when you patch many addresses at the same time.
> > */
> > -static int text_poke_part(void *addr, const void *opcode, size_t len)
> > +static int notrace text_poke_part(void *addr, const void *opcode, size_t len)
>
> I can understand why you want to not trace any of these, but again, I
> hate using notrace unless it is absolutely necessary. That's what the
> set_ftrace_notrace is for. If you don't want to trace it, simply tell
> the kernel not to.

I removed "notrace" from all locations, except for "poke_int3_handler".
Then I tried switching between 7 tracers, enabling and disabling tracing,
and repeated this 100 times. It slowed down from:

real 3m25.837s 3m29.280s 3m27.408s
user 0m0.004s 0m0.004s 0m0.000s
sys 0m3.532s 0m3.460s 0m3.416s

to

real 5m23.294s 5m24.944s 5m28.056s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m3.076s 0m3.180s 0m3.396s

That is a slowdown of about 60%. As you know, the reason is that the
functions used during patching are called via the int3 handler once the
breakpoint is added.

I thought about patching these functions separately, e.g. using a separate
list or extra filters with two-round patching. But I think that it is
not worth the extra complexity.

I agree that it might be worth keeping the functions traceable. So, if
the slowness is acceptable for you, I am fine with it as well.

> > {
> > int ret;
> >
> > @@ -662,7 +662,7 @@ static void *bp_int3_handler, *bp_int3_addr;
> > static size_t bp_int3_len;
> > static void *(*bp_int3_is_handled)(const unsigned long ip);
> >
> > -int poke_int3_handler(struct pt_regs *regs)
> > +int notrace poke_int3_handler(struct pt_regs *regs)
> > {
> > /* bp_patching_in_progress */
> > smp_rmb();
[...]
Well, text_poke is using barriers instead of the atomic variable. I
think that the barriers have two functions in text_poke stuff:

1) make sure that "bp_patching_in_progress" is enabled before the
breakpoint is added; it means that "poke_int3_handler" is ready to
handle the address before the related trap happens.

2) make sure that the variables, e.g. bp_int3_handler, bp_int3_addr, have
correct values before they are accessed in "poke_int3_handler".

I think that the atomic variable serves only the 1st purpose. It does not
solve synchronization of the other variables. I think that it was not a
problem in ftrace because it patched an almost static list of addresses,
but it might be a problem in the generic text_poke_bp (see the sketch
after this message).

> > -atomic_t modifying_ftrace_code __read_mostly;
> > -
> > -/*
> > * Should never be called:
> > * As it is only called by __ftrace_replace_code() which is called by
> > * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()


Best Regards,
Petr
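For reference, a simplified sketch of the barrier pairing described above, modeled on the text_poke_bp()/poke_int3_handler() code this series builds on. The helper name is made up, and details such as the user-mode check and the actual patching steps are omitted:

static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

static void start_bp_patching(void *addr, void *handler)
{
	bp_int3_handler = handler;
	bp_int3_addr = (unsigned char *)addr + 1;
	bp_patching_in_progress = true;
	/*
	 * 1) open poke_int3_handler() and 2) publish the bp_int3_* values
	 * before the int3 byte can be hit on another CPU.
	 */
	smp_wmb();
	/* ... text_poke() the int3 byte, sync, patch the tail, sync ... */
}

int poke_int3_handler(struct pt_regs *regs)
{
	/* pairs with the smp_wmb() above */
	smp_rmb();

	if (likely(!bp_patching_in_progress))
		return 0;

	if (regs->ip != (unsigned long)bp_int3_addr)
		return 0;

	/* redirect to the out-of-line handler while patching is in flight */
	regs->ip = (unsigned long)bp_int3_handler;

	return 1;
}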

Petr Mladek

Jan 23, 2014, 9:30:02 AM
I updated the numbers in the commit messages for the patchset v7. I am slightly
in doubt now. I wonder if the complex iterator-based text_poke_bp_list
is still worth the effort.

I tried to switch between 7 tracers: blk, branch, function_graph,
wakeup_rt, irqsoff, function, and nop. Every tracer has also been
enabled and disabled. I tested it on idle system with Intel(R) Core(TM)2
Duo CPU E8400 @ 3.00GHz. With 500 cycles I got these times:

1. original code (without the patch set):

real 18m4.545s 18m11.568s 18m14.396s
user 0m0.004s 0m0.008s 0m0.004s
sys 0m17.176s 0m16.940s 0m16.664s


2. with patchset v7; notrace used only for poke_int3_handler; the
slowness is caused by heavy use of poke_int3_handler during patching

real 27m11.690s 27m18.176s 27m13.844s
user 0m0.008s 0m0.008s 0m0.008s
sys 0m15.916s 0m15.956s 0m15.860s


3. using text_poke_bp instead of text_poke_bp_list; every address
is patched separately; the slowness is caused by heavy use of
run_sync()

real 34m8.042s 34m15.584s 34m5.008s
user 0m0.004s 0m0.004s 0m0.004s
sys 0m16.296s 0m16.268s 0m16.048s


It means that the slowness caused by tracing "text_poke*" stuff is
comparable with the slowness caused by "run_sync()". It might mean that
"text_poke_bp_list" is not worth the added complexity.

Well, the iterator-based implementation is complex but still faster.
Also the many sync calls might be more painful for a busy system than
heavy use of the int3 handler. So, "text_poke_bp_list" still might be
useful.

Steven Rostedt

Jan 23, 2014, 11:20:03 AM
On Thu, 23 Jan 2014 15:21:42 +0100
Petr Mladek <pml...@suse.cz> wrote:

> It means that the slowness caused by tracing "text_poke*" stuff is
> comparable with the slowness caused by "run_sync()". It might mean that
> "text_poke_bp_list" is not worth the added complexity.

I'm starting to think it's fine keeping ftrace and text_poke separate.
Ftrace is rather the NMI of text modification. It's extremely invasive
and has more corner cases than anything else. I'd rather not complicate
the more generic text_poke just to satisfy ftrace. Perhaps if it were
not such an arch-specific change it might be worth it. As these only have
to deal with x86, and the only rationale in doing it is to get ftrace to
use text_poke() (one code base for all), I think we can drop it.

Having them separate isn't that bad either, as both get heavy testing
as they both are used in normal development.

>
> Well, the iterator-based implementation is complex but still faster.
> Also the many sync calls might be more painful for a busy system that
> heavy use of the int3 handler. So, "text_poke_bp_list" still might be
> useful.

That said, I'm sure text_poke could use some cleanups. That would be
nice as well. I think batch processing with the list may be more useful
for things like kprobes and jump labels.

-- Steve