And this is called out in Documentation/RCU/checklist.txt, item #2:
2. Do the RCU read-side critical sections make proper use of
rcu_read_lock() and friends? These primitives are needed
to prevent grace periods from ending prematurely, which
could result in data being unceremoniously freed out from
under your read-side code, which can greatly increase the
actuarial risk of your kernel.
As a rough rule of thumb, any dereference of an RCU-protected
pointer must be covered by rcu_read_lock() or rcu_read_lock_bh()
or by the appropriate update-side lock.
> There is another instance of __task_cred() in sys_setpriority() itself
> which is equally unprotected.
>
> Wrap the whole code section into a rcu read side critical section to
> fix this quick and dirty.
>
> Will be revisited in course of the read_lock(&tasklist_lock) -> rcu
> crusade.
Good catch!
Acked-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
> Cc: David Howells <dhow...@redhat.com>
> Cc: James Morris <jmo...@namei.org>
> Cc: linux-secu...@vger.kernel.org
> ---
> kernel/sys.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> Index: linux-2.6-tip/kernel/sys.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/sys.c
> +++ linux-2.6-tip/kernel/sys.c
> @@ -163,6 +163,7 @@ SYSCALL_DEFINE3(setpriority, int, which,
> if (niceval > 19)
> niceval = 19;
>
> + rcu_read_lock();
> read_lock(&tasklist_lock);
> switch (which) {
> case PRIO_PROCESS:
> @@ -200,6 +201,7 @@ SYSCALL_DEFINE3(setpriority, int, which,
> }
> out_unlock:
> read_unlock(&tasklist_lock);
> + rcu_read_unlock();
> out:
> return error;
> }
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Looks straight forward and correct to me.
Reviewed-by: KOSAKI Motohiro <kosaki....@jp.fujitsu.com>
Does this mean that we need both rcu_read_lock() *and*
read_lock(&tasklist_lock) when calling find_task_by_vpid() because pid_task()
uses rcu_dereference() and find_pid_ns() uses hlist_for_each_entry_rcu ?
378 /*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */
381 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 {
383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
384 }
385
386 struct task_struct *find_task_by_vpid(pid_t vnr)
387 {
388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
389 }
Acked-by: Paul E. McKenney <pau...@linux.vnet.ibm.com> from RCU perspective.
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
> Cc: James Morris <jmo...@namei.org>
> Cc: linux-secu...@vger.kernel.org
> ---
> security/keys/keyctl.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> Index: linux-2.6-tip/security/keys/keyctl.c
> ===================================================================
> --- linux-2.6-tip.orig/security/keys/keyctl.c
> +++ linux-2.6-tip/security/keys/keyctl.c
> @@ -1237,7 +1237,7 @@ long keyctl_get_security(key_serial_t ke
> long keyctl_session_to_parent(void)
> {
> struct task_struct *me, *parent;
> - const struct cred *mycred, *pcred;
> + const struct cred *mycred, *pcred = NULL;
> struct cred *cred, *oldcred;
> key_ref_t keyring_r;
> int ret;
> @@ -1274,7 +1274,7 @@ long keyctl_session_to_parent(void)
> /* the parent and the child must have different session keyrings or
> * there's no point */
> mycred = current_cred();
> - pcred = __task_cred(parent);
> + pcred = get_task_cred(parent);
> if (mycred == pcred ||
> mycred->tgcred->session_keyring == pcred->tgcred->session_keyring)
> goto already_same;
> @@ -1312,6 +1312,7 @@ long keyctl_session_to_parent(void)
> set_ti_thread_flag(task_thread_info(parent), TIF_NOTIFY_RESUME);
>
> write_unlock_irq(&tasklist_lock);
> + put_cred(pcred);
> if (oldcred)
> put_cred(oldcred);
> return 0;
> @@ -1321,6 +1322,8 @@ already_same:
> not_permitted:
> write_unlock_irq(&tasklist_lock);
> put_cred(cred);
> + if (pcred)
> + put_cred(pcred);
> return ret;
>
> error_keyring:
OK, I will bite... Don't the corresponding updates write-hold
tasklist_lock? If so, then the fact that the code below is read-holding
tasklist_lock would prevent any of the data from changing, which would
remove the need to do the rcu_read_lock().
Or are there updates that are carried out without write-holding
tasklist_lock that I am missing?
Thanx, Paul
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
> Cc: David Howells <dhow...@redhat.com>
> Cc: James Morris <jmo...@namei.org>
> Cc: linux-secu...@vger.kernel.org
> ---
> kernel/sys.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> Index: linux-2.6-tip/kernel/sys.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/sys.c
> +++ linux-2.6-tip/kernel/sys.c
> @@ -163,6 +163,7 @@ SYSCALL_DEFINE3(setpriority, int, which,
> if (niceval > 19)
> niceval = 19;
>
> + rcu_read_lock();
> read_lock(&tasklist_lock);
> switch (which) {
> case PRIO_PROCESS:
> @@ -200,6 +201,7 @@ SYSCALL_DEFINE3(setpriority, int, which,
> }
> out_unlock:
> read_unlock(&tasklist_lock);
> + rcu_read_unlock();
> out:
> return error;
> }
>
>
Off-topic, but can't resist...
This also fixes another bug here. find_task_by_vpid() is not safe
without rcu_read_lock(). I do not mean it is not safe to use the
result, just find_pid_ns() by itself is not safe.
Usually tasklist gives enough protection, but if copy_process() fails
it calls free_pid() lockless and does call_rcu(delayed_put_pid()).
This means, without rcu lock find_pid_ns() can't scan the hash table
safely.
Oleg.
I guess we have whole bunch of auditing to do all over the place. And
we definitely need some debugging aid (preferrably a lockdep
extension) which allows us to detect such nasty details.
Thanks,
tglx
Yes, commit_creds() is called lockless.
Or I misunderstood the question/patch...
Oleg.
> On 12/10, Thomas Gleixner wrote:
> >
> > 1) Remove the misleading comment in __sigqueue_alloc() which claims
> > that holding a spinlock is equivalent to rcu_read_lock().
> >
> > 2) Wrap the __send_signal() call in send_signal() into a rcu read side
> > critical section to guarantee that the __sigqueue_alloc()
> > requirement is met in any case.
> > ...
> > static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
> > int group)
> > {
> > - int from_ancestor_ns = 0;
> > + int ret, from_ancestor_ns = 0;
> >
> > #ifdef CONFIG_PID_NS
> > if (!is_si_special(info) && SI_FROMUSER(info) &&
> > @@ -954,7 +953,11 @@ static int send_signal(int sig, struct s
> > from_ancestor_ns = 1;
> > #endif
> >
> > - return __send_signal(sig, info, t, group, from_ancestor_ns);
> > + rcu_read_lock();
> > + ret = __send_signal(sig, info, t, group, from_ancestor_ns);
> > + rcu_read_unlock();
>
> But, without a comment it is very unobvious why do we need rcu_read_lock().
>
> Perhaps it is better to modify __sigqueue_alloc() instead? It can take
> rcu_lock() around cred->user itself.
Indeed. Was too tired to see the obvious :)
Thanks,
tglx
Right, and that's what the problem is. commit_creds(), which rcu frees
the old creds, does not take tasklist lock write lock.
Thanks,
tglx
But, without a comment it is very unobvious why do we need rcu_read_lock().
Perhaps it is better to modify __sigqueue_alloc() instead? It can take
rcu_lock() around cred->user itself.
Oleg.
So, we need to change below comment from "or" to "and" ?
378 /*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */
381 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 {
383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
384 }
385
386 struct task_struct *find_task_by_vpid(pid_t vnr)
387 {
388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
389 }
Acked-by: Oleg Nesterov <ol...@redhat.com>
> Index: linux-2.6-tip/kernel/signal.c
> ===================================================================
> --- linux-2.6-tip.orig/kernel/signal.c
> +++ linux-2.6-tip/kernel/signal.c
> @@ -1175,11 +1175,12 @@ int kill_pid_info_as_uid(int sig, struct
> int ret = -EINVAL;
> struct task_struct *p;
> const struct cred *pcred;
> + unsigned long flags;
>
> if (!valid_signal(sig))
> return ret;
>
> - read_lock(&tasklist_lock);
> + rcu_read_lock();
> p = pid_task(pid, PIDTYPE_PID);
> if (!p) {
> ret = -ESRCH;
> @@ -1196,14 +1197,16 @@ int kill_pid_info_as_uid(int sig, struct
> ret = security_task_kill(p, info, sig, secid);
> if (ret)
> goto out_unlock;
> - if (sig && p->sighand) {
> - unsigned long flags;
> - spin_lock_irqsave(&p->sighand->siglock, flags);
> - ret = __send_signal(sig, info, p, 1, 0);
> - spin_unlock_irqrestore(&p->sighand->siglock, flags);
> +
> + if (sig) {
> + if (lock_task_sighand(p, &flags)) {
> + ret = __send_signal(sig, info, p, 1, 0);
> + unlock_task_sighand(p, &flags);
> + } else
> + ret = -ESRCH;
> }
> out_unlock:
> - read_unlock(&tasklist_lock);
> + rcu_read_unlock();
> return ret;
> }
> EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
No, both functions must be called with rcu_read_lock()
tasklist_lock read-held is not protecting the rcu lists and does not
protect against a concurrent update. It merily protects against tasks
going away or being added while we look up the lists.
> 378 /*
> 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
> 380 */
> 381 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
> 382 {
> 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
> 384 }
> 385
> 386 struct task_struct *find_task_by_vpid(pid_t vnr)
> 387 {
> 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
> 389 }
>
Thanks,
tglx
sys: Fix missing rcu protection for __task_cred() access
commit c69e8d9 (CRED: Use RCU to access another task's creds and to
release a task's own creds) added non rcu_read_lock() protected access
to task creds of the target task in set_one_prio().
The comment above the function says:
* - the caller must hold the RCU read lock
The calling code in sys_setpriority does read_lock(&tasklist_lock) but
not rcu_read_lock(). This works only when CONFIG_TREE_PREEMPT_RCU=n.
With CONFIG_TREE_PREEMPT_RCU=y the rcu_callbacks can run in the tick
interrupt when they see no read side critical section.
There is another instance of __task_cred() in sys_setpriority() itself
which is equally unprotected.
Wrap the whole code section into a rcu read side critical section to
fix this quick and dirty.
Will be revisited in course of the read_lock(&tasklist_lock) -> rcu
crusade.
Oleg noted further:
This also fixes another bug here. find_task_by_vpid() is not safe
without rcu_read_lock(). I do not mean it is not safe to use the
result, just find_pid_ns() by itself is not safe.
Usually tasklist gives enough protection, but if copy_process() fails
it calls free_pid() lockless and does call_rcu(delayed_put_pid()).
This means, without rcu lock find_pid_ns() can't scan the hash table
safely.
Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
LKML-Reference: <200912100047...@linutronix.de>
Acked-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---
kernel/sys.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5f..bc1dc61 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -163,6 +163,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
if (niceval > 19)
niceval = 19;
+ rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
case PRIO_PROCESS:
@@ -200,6 +201,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
}
out_unlock:
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
out:
return error;
}
signals: Fix more rcu assumptions
1) Remove the misleading comment in __sigqueue_alloc() which claims
that holding a spinlock is equivalent to rcu_read_lock().
2) Add a rcu_read_lock/unlock around the __task_cred() access
in __sigqueue_alloc()
This needs to be revisited to remove the remaining users of
read_lock(&tasklist_lock) but that's outside the scope of this patch.
Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
LKML-Reference: <200912100047...@linutronix.de>
---
kernel/signal.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/signal.c b/kernel/signal.c
index 7331656..f67545f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
struct user_struct *user;
/*
- * We won't get problems with the target's UID changing under us
- * because changing it requires RCU be used, and if t != current, the
- * caller must be holding the RCU readlock (by way of a spinlock) and
- * we use RCU protection here
+ * Protect access to @t credentials. This can go away when all
+ * callers hold rcu read lock.
*/
+ rcu_read_lock();
user = get_uid(__task_cred(t)->user);
atomic_inc(&user->sigpending);
+ rcu_read_unlock();
if (override_rlimit ||
atomic_read(&user->sigpending) <=
signal: Fix racy access to __task_cred in kill_pid_info_as_uid()
kill_pid_info_as_uid() accesses __task_cred() without being in a RCU
read side critical section. tasklist_lock is not protecting that when
CONFIG_TREE_PREEMPT_RCU=y.
Convert the whole tasklist_lock section to rcu and use
lock_task_sighand to prevent the exit race.
Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
LKML-Reference: <200912100047...@linutronix.de>
Acked-by: Oleg Nesterov <ol...@redhat.com>
---
kernel/signal.c | 17 ++++++++++-------
1 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/kernel/signal.c b/kernel/signal.c
index 6b982f2..7331656 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1175,11 +1175,12 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
int ret = -EINVAL;
struct task_struct *p;
const struct cred *pcred;
+ unsigned long flags;
if (!valid_signal(sig))
return ret;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = pid_task(pid, PIDTYPE_PID);
if (!p) {
ret = -ESRCH;
@@ -1196,14 +1197,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
ret = security_task_kill(p, info, sig, secid);
if (ret)
goto out_unlock;
- if (sig && p->sighand) {
- unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- ret = __send_signal(sig, info, p, 1, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+
+ if (sig) {
+ if (lock_task_sighand(p, &flags)) {
+ ret = __send_signal(sig, info, p, 1, 0);
+ unlock_task_sighand(p, &flags);
+ } else
+ ret = -ESRCH;
}
out_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
So, rcu_read_lock() is mandatory.
But I couldn't understand why tasklist_lock being held is not mandatory.
Caller of these functions will use "struct task_struct" and expect that values
and pointers retrieved by dereferencing returned pointer are valid.
If "struct task_struct" was removed from the list due to missing tasklist_lock
between returning from find_task_by_pid_ns() and dereferencing, I worry that
values and pointers retrieved by dereferencing are invalid.
rcu_read_lock() being held will prevent the returned "struct task_struct" from
being kfree()d, but I think rcu_read_lock() being held does not prevent values
and pointers of "struct task_struct" from being invalidated.
Anyway, I browsed 2.6.32 for find_task_by_vpid() / find_task_by_pid_ns() users
and found below users which lacks read_lock(&tasklist_lock) or rcu_read_lock().
Users missing read_lock(&tasklist_lock) when calling find_task_by_vpid():
get_net_ns_by_pid() in net/core/net_namespace.c
seq_print_userip_objs() in kernel/trace/trace_output.c
fill_pid() in kernel/taskstats.c
fill_tgid() in kernel/taskstats.c
futex_find_get_task() in kernel/futex.c
SYSCALL_DEFINE3(get_robust_list) in kernel/futex.c
compat_sys_get_robust_list() in kernel/futex_compat.c
ptrace_get_task_struct() in kernel/ptrace.c
do_send_specific() in kernel/signal.c
find_get_context() in kernel/perf_event.c
posix_cpu_clock_get() in kernel/posix-cpu-timers.c
good_sigevent() in kernel/posix-timers.c
attach_task_by_pid() in kernel/cgroup.c
SYSCALL_DEFINE1(getpgid) in kernel/sys.c
SYSCALL_DEFINE1(getsid) in kernel/sys.c
do_sched_setscheduler() in kernel/sched.c
Users missing rcu_read_lock() when calling find_task_by_vpid():
SYSCALL_DEFINE3(ioprio_set) in fs/ioprio.c
SYSCALL_DEFINE2(ioprio_get) in fs/ioprio.c
cap_get_target_pid() in kernel/capability.c
audit_prepare_user_tty() in kernel/audit.c
audit_receive_msg() in kernel/audit.c
check_clock() in kernel/posix-cpu-timers.c
posix_cpu_timer_create() in kernel/posix-cpu-timers.c
SYSCALL_DEFINE3(setpriority) in kernel/sys.c
SYSCALL_DEFINE2(getpriority) in kernel/sys.c
SYSCALL_DEFINE2(setpgid) in kernel/sys.c
SYSCALL_DEFINE1(sched_getscheduler) in kernel/sched.c
SYSCALL_DEFINE2(sched_getparam) in kernel/sched.c
sched_setaffinity() in kernel/sched.c
sched_getaffinity() in kernel/sched.c
SYSCALL_DEFINE2(sched_rr_get_interval) in kernel/sched.c
tomoyo_is_select_one() in security/tomoyo/common.c
tomoyo_read_pid() in security/tomoyo/common.c
SYSCALL_DEFINE6(move_pages) in mm/migrate.c
SYSCALL_DEFINE4(migrate_pages) in mm/mempolicy.c
find_process_by_pid() in arch/mips/kernel/mips-mt-fpaff.c
pfm_get_task() in arch/ia64/kernel/perfmon.c
cxn_pin_by_pid() in arch/frv/mm/mmu-context.c
Users missing read_lock(&tasklist_lock) when calling find_task_by_pid_ns():
rest_init() in init/main.c
proc_pid_lookup() in fs/proc/base.c
proc_task_lookup() in fs/proc/base.c
first_tid() in fs/proc/base.c
getthread() in kernel/kgdb.c
mconsole_stack() in arch/um/drivers/mconsole_kern.c
Users missing rcu_read_lock() when calling find_task_by_pid_ns():
rest_init() in init/main.c
getthread() in kernel/kgdb.c
mconsole_stack() in arch/um/drivers/mconsole_kern.c
Regarding users which lack rcu_read_lock(), users call
read_lock(&tasklist_lock) in order to access "struct task_struct" returned
by find_task_by_pid_ns() but they do not want to be bothered by internal pid
structure. Thus, the fix would be to add rcu_read_lock() and rcu_read_unlock()
inside find_task_by_pid_ns().
But how to check users which lack read_lock(&tasklist_lock) but expecting that
it is safe to access "struct task_struct" returned by find_task_by_pid_ns() ?
> task_sig() accesses __task_cred() without being in a RCU read side
> critical section. tasklist_lock is not protecting that when
> CONFIG_TREE_PREEMPT_RCU=y.
>
> Add a rcu_read_lock/unlock() section around the code which accesses
> __task_cred().
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Acked-by: David Howells <dhow...@redhat.com>
> > > Or are there updates that are carried out without write-holding
> > > tasklist_lock that I am missing?
> >
> > Yes, commit_creds() is called lockless.
>
> Right, and that's what the problem is. commit_creds(), which rcu frees
> the old creds, does not take tasklist lock write lock.
commit_creds() does not need to hold a write lock, because it is implicitly
write-locked by only being permitted to run in the thread to which it is
committing.
I don't think commit_creds() needs to take the RCU read lock as no-one else
can alter/delete the creds it is dealing with.
David
> sys_ioprio_get() accesses __task_cred() without being in a RCU read
> side critical section. tasklist_lock is not protecting that when
> CONFIG_TREE_PREEMPT_RCU=y.
>
> Add a rcu_read_lock/unlock() section around the code which accesses
> __task_cred().
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Acked-by: David Howells <dhow...@redhat.com>
> commit c69e8d9 (CRED: Use RCU to access another task's creds and to
> release a task's own creds) added non rcu_read_lock() protected access
> to task creds of the target task in set_one_prio().
>
> The comment above the function says:
> * - the caller must hold the RCU read lock
>
> The calling code in sys_setpriority does read_lock(&tasklist_lock) but
> not rcu_read_lock(). This works only when CONFIG_TREE_PREEMPT_RCU=n.
> With CONFIG_TREE_PREEMPT_RCU=y the rcu_callbacks can run in the tick
> interrupt when they see no read side critical section.
>
> There is another instance of __task_cred() in sys_setpriority() itself
> which is equally unprotected.
>
> Wrap the whole code section into a rcu read side critical section to
> fix this quick and dirty.
>
> Will be revisited in course of the read_lock(&tasklist_lock) -> rcu
> crusade.
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
> Cc: James Morris <jmo...@namei.org>
> Cc: linux-secu...@vger.kernel.org
Acked-by: David Howells <dhow...@redhat.com>
> kill_pid_info_as_uid() accesses __task_cred() without being in a RCU
> read side critical section. tasklist_lock is not protecting that when
> CONFIG_TREE_PREEMPT_RCU=y.
>
> Convert the whole tasklist_lock section to rcu and use
> lock_task_sighand to prevent the exit race.
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Acked-by: David Howells <dhow...@redhat.com>
> + /* Protect __task_cred() access */
> + rcu_read_lock();
> printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
> p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
> get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
> p->comm);
> + rcu_read_unlock();
No. If there's only one access to __task_cred() like this, use
task_cred_xxx() or one of its wrappers instead:
- p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
+ p->pid, task_uid(p), p->tgid, mm->total_vm,
that limits the size of the critical section.
David
> Thomas Gleixner <tg...@linutronix.de> wrote:
>
> > > > Or are there updates that are carried out without write-holding
> > > > tasklist_lock that I am missing?
> > >
> > > Yes, commit_creds() is called lockless.
> >
> > Right, and that's what the problem is. commit_creds(), which rcu frees
> > the old creds, does not take tasklist lock write lock.
>
> commit_creds() does not need to hold a write lock, because it is implicitly
> write-locked by only being permitted to run in the thread to which it is
> committing.
>
> I don't think commit_creds() needs to take the RCU read lock as no-one else
> can alter/delete the creds it is dealing with.
commit_cred() is not required to take anything, but the reader side
needs to take rcu_read_lock() when accessing __task_cred() of another
task as that task could update its own creds right at that point.
The point is that read_lock(tasklist_lock) is not sufficient for the
reader side.
Thanks,
tglx
> Strictly this is not necessary today, but according to Paul McKenney
> it's not guaranteed that irq disabled regions will prevent RCU
> progress. That's a property of the current implementation. In
> preempt-rt this assumption is not true anymore.
>
> Use get_task_cred() to make this future proof.
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Acked-by: David Howells <dhow...@redhat.com>
> Thomas Gleixner <tg...@linutronix.de> wrote:
>
> > + /* Protect __task_cred() access */
> > + rcu_read_lock();
> > printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
> > p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
> > get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
> > p->comm);
> > + rcu_read_unlock();
>
> No. If there's only one access to __task_cred() like this, use
> task_cred_xxx() or one of its wrappers instead:
>
> - p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
> + p->pid, task_uid(p), p->tgid, mm->total_vm,
>
> that limits the size of the critical section.
Fair enough.
tglx
> > Perhaps it is better to modify __sigqueue_alloc() instead? It can take
> > rcu_lock() around cred->user itself.
>
> Indeed. Was too tired to see the obvious :)
Ah, but... If __sigqueue_alloc() is called from sigqueue_alloc(), then you
don't need the RCU read lock as the target task is current.
So perhaps the callsite for __sigqueue_alloc() in __send_signal()? That at
least puts the rcu_read_lock() call in proximity to the function that actually
needs it.
David
> 1) held spinlocks are not equivalent to rcu_read_lock
>
> 2) access to current_cred() is safe as only current can modify its
> own credentials.
>
> Signed-off-by: Thomas Gleixner <tg...@linutronix.de>
Acked-by: David Howells <dhow...@redhat.com>
So, we need to update the comment on these functions?
----------------------------------------
[PATCH] Update comment on find_task_by_pid_ns
Caller of find_task_by_vpid() and find_task_by_pid_ns() needs to call
rcu_read_lock() rather than read_lock(&tasklist_lock) because find_pid_ns()
uses RCU primitives but spinlock does not prevent RCU callback if preemptive
RCU ( CONFIG_TREE_PREEMPT_RCU ) is enabled.
Signed-off-by: Tetsuo Handa <penguin...@I-love.SAKURA.ne.jp>
---
kernel/pid.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- linux-next.orig/kernel/pid.c
+++ linux-next/kernel/pid.c
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid
EXPORT_SYMBOL(pid_task);
/*
- * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ * Must be called under rcu_read_lock().
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
I agree with the patch, but the changelog looks a bit confusing to me.
Perhaps this is just me, though.
tasklist does protect the task and its pid, it can't go away. The problem
is that find_pid_ns() itself is unsafe without rcu lock, it can race with
copy_process()->free_pid(any_pid).
IOW, if we change copy_process()
--- kernel/fork.c
+++ kernel/fork.c
@@ -1304,8 +1304,11 @@ static struct task_struct *copy_process(
return p;
bad_fork_free_pid:
- if (pid != &init_struct_pid)
+ if (pid != &init_struct_pid) {
+ read_lock(&tasklist_lock);
free_pid(pid);
+ read_unlock(&tasklist_lock);
+ }
bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
then find_task_by_pid_ns/etc could be used under tasklist safely even
with PREEMPT_RCU.
In any case, I think the patch is nice.
Oleg.
We try to get rid of the read_lock sites of tasklist_lock, so please
let's not think about adding more :)
Thanks,
tglx
Yes, yes, I agree. I didn't mean this patch makes sense.
Oleg.
As of 2.6.32 , below users are missing rcu_read_lock().
Users missing rcu_read_lock() when calling find_task_by_vpid():
SYSCALL_DEFINE3(ioprio_set) in fs/ioprio.c
SYSCALL_DEFINE2(ioprio_get) in fs/ioprio.c
cap_get_target_pid() in kernel/capability.c
audit_prepare_user_tty() in kernel/audit.c
audit_receive_msg() in kernel/audit.c
check_clock() in kernel/posix-cpu-timers.c
posix_cpu_timer_create() in kernel/posix-cpu-timers.c
SYSCALL_DEFINE3(setpriority) in kernel/sys.c
SYSCALL_DEFINE2(getpriority) in kernel/sys.c
SYSCALL_DEFINE2(setpgid) in kernel/sys.c
SYSCALL_DEFINE1(sched_getscheduler) in kernel/sched.c
SYSCALL_DEFINE2(sched_getparam) in kernel/sched.c
sched_setaffinity() in kernel/sched.c
sched_getaffinity() in kernel/sched.c
SYSCALL_DEFINE2(sched_rr_get_interval) in kernel/sched.c
tomoyo_is_select_one() in security/tomoyo/common.c
tomoyo_read_pid() in security/tomoyo/common.c
SYSCALL_DEFINE6(move_pages) in mm/migrate.c
SYSCALL_DEFINE4(migrate_pages) in mm/mempolicy.c
find_process_by_pid() in arch/mips/kernel/mips-mt-fpaff.c
pfm_get_task() in arch/ia64/kernel/perfmon.c
cxn_pin_by_pid() in arch/frv/mm/mmu-context.c
Users missing rcu_read_lock() when calling find_task_by_pid_ns():
rest_init() in init/main.c
getthread() in kernel/kgdb.c
mconsole_stack() in arch/um/drivers/mconsole_kern.c
What should we do? Adding rcu_read_lock()/rcu_read_unlock() to each
callers? Or adding rcu_read_lock()/rcu_read_unlock() inside
find_task_by_pid_ns()?
--------------------
[PATCH] Update comment on find_task_by_pid_ns
tasklist does protect the task and its pid, it can't go away. The problem
is that find_pid_ns() itself is unsafe without rcu lock, it can race with
copy_process()->free_pid(any_pid).
Protecting copy_process()->free_pid(any_pid) with tasklist_lock would make it
possible to call find_task_by_pid_ns() under tasklist safely, but we don't do
so because we are trying to get rid of the read_lock sites of tasklist_lock.
Signed-off-by: Tetsuo Handa <penguin...@I-love.SAKURA.ne.jp>
---
kernel/pid.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- linux-next.orig/kernel/pid.c
+++ linux-next/kernel/pid.c
@@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid
EXPORT_SYMBOL(pid_task);
/*
- * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ * Must be called under rcu_read_lock().
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
> OK. I updated description.
>
> As of 2.6.32 , below users are missing rcu_read_lock().
>
> Users missing rcu_read_lock() when calling find_task_by_vpid():
>
> SYSCALL_DEFINE3(ioprio_set) in fs/ioprio.c
> SYSCALL_DEFINE2(ioprio_get) in fs/ioprio.c
> cap_get_target_pid() in kernel/capability.c
Actually, cap_get_target_pid() uses rcu_read_lock() and doesn't take
tasklist_lock.
Putting rcu_read_lock() in the callee isn't a complete solution.
Because the function would still be returning a task_struct* without
any locking held and without taking a reference against it. So that
pointer is useless to the caller!
We could add a new function which looks up the task and then takes a
reference on it, insde suitable locks. The caller would then use the
task_struct and then remember to call put_task_struct() to unpin it.
This prevents the task_struct from getting freed while it's being
manipulated, but it doesn't prevent fields within it from being altered
- that's up to the caller to sort out.
One fix is to go through all those callsites and add the rcu_read_lock.
That kinda sucks. Perhaps writing the new function which returns a
pinned task_struct is better?
Hmm - is that in -mm? In my copy here it takes read_lock(&tasklist_lock)
And I'll admit I'm a bit confused as to the current state of things:
do I understand correctly that we now need to take both the tasklist_lock
and rcu_read_lock? (Presumably only for read_lock()?)
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in
> Quoting Andrew Morton (ak...@linux-foundation.org):
> > On Tue, 9 Feb 2010 06:42:45 +0900
> > Tetsuo Handa <penguin...@I-love.SAKURA.ne.jp> wrote:
> >
> > > OK. I updated description.
> > >
> > > As of 2.6.32 , below users are missing rcu_read_lock().
> > >
> > > Users missing rcu_read_lock() when calling find_task_by_vpid():
> > >
> > > SYSCALL_DEFINE3(ioprio_set) in fs/ioprio.c
> > > SYSCALL_DEFINE2(ioprio_get) in fs/ioprio.c
> > > cap_get_target_pid() in kernel/capability.c
> >
> > Actually, cap_get_target_pid() uses rcu_read_lock() and doesn't take
> > tasklist_lock.
>
> Hmm - is that in -mm? In my copy here it takes read_lock(&tasklist_lock)
yup. It got changed in linux-next.
> And I'll admit I'm a bit confused as to the current state of things:
> do I understand correctly that we now need to take both the tasklist_lock
> and rcu_read_lock? (Presumably only for read_lock()?)
Beats me. We need to protect both the pid->task_struct lookup data
structures (during the lookup) and protect the resulting task_struct
while the caller is playing with it. It's unclear whether
rcu_read_lock() suffices for both purposes.
The rcu_read_lock section is sufficient. task_struct can not go away
before the rcu_read_unlock()
Thanks,
tglx
But, more generally, it used to be the case that a spinlock (or
rwlock) would imply an rcu read cycle, but now it no longer does,
do I understand that right?
thanks,
-serge
You are correct, with CONFIG_PREEMPT_RCU, acquiring a lock does -not-
imply an RCU read-side critical section. And acquiring a lock does
-not- imply any sort of RCU read-side critical section in -rt kernels
in any case.
Thanx, Paul
Code for "struct task_struct" is too complicated for me to understand,
but my understanding is that
(1) tasklist_lock is acquired for writing.
(2) "struct task_struct" (to exit()) is removed from task's list.
(3) tasklist_lock is released.
(4) Wait for RCU grace period.
(5) kfree() members of "struct task_struct".
(6) kfree() "struct task_struct" itself.
If above sequence is correct, I think
rcu_read_lock();
task = find_task_by_pid_ns();
if (task)
do_something(task);
rcu_read_unlock();
do_something() can safely access all members of task without
read_lock(&tasklist_lock), except task->prev (I don't know the exact member)
and task->usage, because do_something() finishes its work before (5).
I think we need to call find_task_by_pid_ns() with both
read_lock(&tasklist_lock) and rcu_read_lock()
read_lock(&tasklist_lock);
rcu_read_lock();
task = find_task_by_pid_ns();
if (task)
do_something(task);
rcu_read_unlock();
read_unlock(&tasklist_lock);
only when do_something() wants to access task->prev or task->usage .
This bug for sys_setpriority() was fixed, but not fixed for sys_getpriority().
Why not to add it as well?
--------------------
[PATCH] sys: Fix missing rcu protection for sys_getpriority.
find_task_by_vpid() is not safe without rcu_read_lock().
2.6.33-rc7 got RCU protection for sys_setpriority() but missed it for
sys_getpriority().
Signed-off-by: Tetsuo Handa <penguin...@I-love.SAKURA.ne.jp>
---
kernel/sys.c | 2 ++
1 file changed, 2 insertions(+)
--- linux-2.6.33-rc7.orig/kernel/sys.c
+++ linux-2.6.33-rc7/kernel/sys.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which,
if (which > PRIO_USER || which < PRIO_PROCESS)
return -EINVAL;
+ rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
case PRIO_PROCESS:
@@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which,
}
out_unlock:
read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return retval;
Would be needed indeed but I don't have a copy of linux-next handy -
if this isn't changed there yet then
Acked-by: Serge Hallyn <se...@us.ibm.com>
thanks,
-serge
> ---
> kernel/sys.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> --- linux-2.6.33-rc7.orig/kernel/sys.c
> +++ linux-2.6.33-rc7/kernel/sys.c
> @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which,
> if (which > PRIO_USER || which < PRIO_PROCESS)
> return -EINVAL;
>
> + rcu_read_lock();
> read_lock(&tasklist_lock);
> switch (which) {
> case PRIO_PROCESS:
> @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which,
> }
> out_unlock:
> read_unlock(&tasklist_lock);
> + rcu_read_unlock();
>
> return retval;
> }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-security-module" in