bpf: Enable task local storage for tracing programs
To access per-task data, BPF programs usually creates a hash table with
pid as the key. This is not ideal because:
1. The user need to estimate the proper size of the hash table, which may
be inaccurate;
2. Big hash tables are slow;
3. To clean up the data properly during task terminations, the user need
to write extra logic.
Task local storage overcomes these issues and offers a better option for
these per-task data. Task local storage is only available to BPF_LSM. Now
enable it for tracing programs.
Unlike LSM programs, tracing programs can be called in IRQ contexts.
Helpers that access task local storage are updated to use
raw_spin_lock_irqsave() instead of raw_spin_lock_bh().
Tracing programs can attach to functions on the task free path, e.g.
exit_creds(). To avoid allocating task local storage after
bpf_task_storage_free(). bpf_task_storage_get() is updated to not allocate
new storage when the task is not refcounted (task->usage == 0).
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com
This commit is contained in:
committed by
Alexei Starovoitov
parent
9c8f21e6f8
commit
a10787e6d5
@@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
|
||||
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
|
||||
obj-${CONFIG_BPF_LSM} += bpf_task_storage.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
|
||||
obj-$(CONFIG_BPF_JIT) += trampoline.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += btf.o
|
||||
@@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
|
||||
ifeq ($(CONFIG_NET),y)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += offload.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
|
||||
endif
|
||||
|
||||
@@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
bool free_local_storage = false;
|
||||
unsigned long flags;
|
||||
|
||||
if (unlikely(!selem_linked_to_storage(selem)))
|
||||
/* selem has already been unlinked from sk */
|
||||
return;
|
||||
|
||||
local_storage = rcu_dereference(selem->local_storage);
|
||||
raw_spin_lock_bh(&local_storage->lock);
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (likely(selem_linked_to_storage(selem)))
|
||||
free_local_storage = bpf_selem_unlink_storage_nolock(
|
||||
local_storage, selem, true);
|
||||
raw_spin_unlock_bh(&local_storage->lock);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
|
||||
if (free_local_storage)
|
||||
kfree_rcu(local_storage, rcu);
|
||||
@@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage_map *smap;
|
||||
struct bpf_local_storage_map_bucket *b;
|
||||
unsigned long flags;
|
||||
|
||||
if (unlikely(!selem_linked_to_map(selem)))
|
||||
/* selem has already be unlinked from smap */
|
||||
@@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
|
||||
|
||||
smap = rcu_dereference(SDATA(selem)->smap);
|
||||
b = select_bucket(smap, selem);
|
||||
raw_spin_lock_bh(&b->lock);
|
||||
raw_spin_lock_irqsave(&b->lock, flags);
|
||||
if (likely(selem_linked_to_map(selem)))
|
||||
hlist_del_init_rcu(&selem->map_node);
|
||||
raw_spin_unlock_bh(&b->lock);
|
||||
raw_spin_unlock_irqrestore(&b->lock, flags);
|
||||
}
|
||||
|
||||
void bpf_selem_link_map(struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage_elem *selem)
|
||||
{
|
||||
struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_bh(&b->lock);
|
||||
raw_spin_lock_irqsave(&b->lock, flags);
|
||||
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
|
||||
hlist_add_head_rcu(&selem->map_node, &b->list);
|
||||
raw_spin_unlock_bh(&b->lock);
|
||||
raw_spin_unlock_irqrestore(&b->lock, flags);
|
||||
}
|
||||
|
||||
void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
|
||||
@@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
|
||||
|
||||
sdata = SDATA(selem);
|
||||
if (cacheit_lockit) {
|
||||
unsigned long flags;
|
||||
|
||||
/* spinlock is needed to avoid racing with the
|
||||
* parallel delete. Otherwise, publishing an already
|
||||
* deleted sdata to the cache will become a use-after-free
|
||||
* problem in the next bpf_local_storage_lookup().
|
||||
*/
|
||||
raw_spin_lock_bh(&local_storage->lock);
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
if (selem_linked_to_storage(selem))
|
||||
rcu_assign_pointer(local_storage->cache[smap->cache_idx],
|
||||
sdata);
|
||||
raw_spin_unlock_bh(&local_storage->lock);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
}
|
||||
|
||||
return sdata;
|
||||
@@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
||||
struct bpf_local_storage_data *old_sdata = NULL;
|
||||
struct bpf_local_storage_elem *selem;
|
||||
struct bpf_local_storage *local_storage;
|
||||
unsigned long flags;
|
||||
int err;
|
||||
|
||||
/* BPF_EXIST and BPF_NOEXIST cannot be both set */
|
||||
@@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
||||
}
|
||||
}
|
||||
|
||||
raw_spin_lock_bh(&local_storage->lock);
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
|
||||
/* Recheck local_storage->list under local_storage->lock */
|
||||
if (unlikely(hlist_empty(&local_storage->list))) {
|
||||
@@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
|
||||
}
|
||||
|
||||
unlock:
|
||||
raw_spin_unlock_bh(&local_storage->lock);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
return SDATA(selem);
|
||||
|
||||
unlock_err:
|
||||
raw_spin_unlock_bh(&local_storage->lock);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
|
||||
@@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
return &bpf_spin_lock_proto;
|
||||
case BPF_FUNC_spin_unlock:
|
||||
return &bpf_spin_unlock_proto;
|
||||
case BPF_FUNC_task_storage_get:
|
||||
return &bpf_task_storage_get_proto;
|
||||
case BPF_FUNC_task_storage_delete:
|
||||
return &bpf_task_storage_delete_proto;
|
||||
case BPF_FUNC_bprm_opts_set:
|
||||
return &bpf_bprm_opts_set_proto;
|
||||
case BPF_FUNC_ima_inode_hash:
|
||||
|
||||
@@ -15,7 +15,6 @@
|
||||
#include <linux/bpf_local_storage.h>
|
||||
#include <linux/filter.h>
|
||||
#include <uapi/linux/btf.h>
|
||||
#include <linux/bpf_lsm.h>
|
||||
#include <linux/btf_ids.h>
|
||||
#include <linux/fdtable.h>
|
||||
|
||||
@@ -24,12 +23,8 @@ DEFINE_BPF_STORAGE_CACHE(task_cache);
|
||||
static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
|
||||
{
|
||||
struct task_struct *task = owner;
|
||||
struct bpf_storage_blob *bsb;
|
||||
|
||||
bsb = bpf_task(task);
|
||||
if (!bsb)
|
||||
return NULL;
|
||||
return &bsb->storage;
|
||||
return &task->bpf_storage;
|
||||
}
|
||||
|
||||
static struct bpf_local_storage_data *
|
||||
@@ -38,13 +33,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
|
||||
{
|
||||
struct bpf_local_storage *task_storage;
|
||||
struct bpf_local_storage_map *smap;
|
||||
struct bpf_storage_blob *bsb;
|
||||
|
||||
bsb = bpf_task(task);
|
||||
if (!bsb)
|
||||
return NULL;
|
||||
|
||||
task_storage = rcu_dereference(bsb->storage);
|
||||
task_storage = rcu_dereference(task->bpf_storage);
|
||||
if (!task_storage)
|
||||
return NULL;
|
||||
|
||||
@@ -57,16 +47,12 @@ void bpf_task_storage_free(struct task_struct *task)
|
||||
struct bpf_local_storage_elem *selem;
|
||||
struct bpf_local_storage *local_storage;
|
||||
bool free_task_storage = false;
|
||||
struct bpf_storage_blob *bsb;
|
||||
struct hlist_node *n;
|
||||
|
||||
bsb = bpf_task(task);
|
||||
if (!bsb)
|
||||
return;
|
||||
unsigned long flags;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
local_storage = rcu_dereference(bsb->storage);
|
||||
local_storage = rcu_dereference(task->bpf_storage);
|
||||
if (!local_storage) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
@@ -81,7 +67,7 @@ void bpf_task_storage_free(struct task_struct *task)
|
||||
* when unlinking elem from the local_storage->list and
|
||||
* the map's bucket->list.
|
||||
*/
|
||||
raw_spin_lock_bh(&local_storage->lock);
|
||||
raw_spin_lock_irqsave(&local_storage->lock, flags);
|
||||
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
|
||||
/* Always unlink from map before unlinking from
|
||||
* local_storage.
|
||||
@@ -90,7 +76,7 @@ void bpf_task_storage_free(struct task_struct *task)
|
||||
free_task_storage = bpf_selem_unlink_storage_nolock(
|
||||
local_storage, selem, false);
|
||||
}
|
||||
raw_spin_unlock_bh(&local_storage->lock);
|
||||
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* free_task_storage should always be true as long as
|
||||
@@ -150,7 +136,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
|
||||
*/
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
task = pid_task(pid, PIDTYPE_PID);
|
||||
if (!task || !task_storage_ptr(task)) {
|
||||
if (!task) {
|
||||
err = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
@@ -213,23 +199,16 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
|
||||
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
|
||||
return (unsigned long)NULL;
|
||||
|
||||
/* explicitly check that the task_storage_ptr is not
|
||||
* NULL as task_storage_lookup returns NULL in this case and
|
||||
* bpf_local_storage_update expects the owner to have a
|
||||
* valid storage pointer.
|
||||
*/
|
||||
if (!task || !task_storage_ptr(task))
|
||||
if (!task)
|
||||
return (unsigned long)NULL;
|
||||
|
||||
sdata = task_storage_lookup(task, map, true);
|
||||
if (sdata)
|
||||
return (unsigned long)sdata->data;
|
||||
|
||||
/* This helper must only be called from places where the lifetime of the task
|
||||
* is guaranteed. Either by being refcounted or by being protected
|
||||
* by an RCU read-side critical section.
|
||||
*/
|
||||
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
|
||||
/* only allocate new storage, when the task is refcounted */
|
||||
if (refcount_read(&task->usage) &&
|
||||
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
|
||||
sdata = bpf_local_storage_update(
|
||||
task, (struct bpf_local_storage_map *)map, value,
|
||||
BPF_NOEXIST);
|
||||
|
||||
Reference in New Issue
Block a user