tracing fixes for v6.7-rc7:

- Fix readers that are blocked on the ring buffer when buffer_percent is
   100%. They are supposed to wake up when the buffer is full, but
   because the sub-buffer that the writer is on is never considered
   "dirty" in the calculation, dirty pages will never equal nr_pages.
   Add +1 to the dirty count in order to count for the sub-buffer that
   the writer is on.
 
 - When a reader is blocked on the "snapshot_raw" file, it is to be
   woken up when a snapshot is done and be able to read the snapshot
   buffer. But because the snapshot swaps the buffers (the main one
   with the snapshot one), and the snapshot reader is waiting on the
   old snapshot buffer, it was not woken up (because it is now on
   the main buffer after the swap). Worse yet, when it reads the buffer
   after a snapshot, it's not reading the snapshot buffer, it's reading
   the live active main buffer.
 
   Fix this by forcing a wakeup of all readers on the snapshot buffer when
   a new snapshot happens, and then update the buffer that the reader
   is reading to be back on the snapshot buffer.
 
 - Fix the modification of the direct_function hash. There was a race
   when new functions were added to the direct_function hash as when
   it moved function entries from the old hash to the new one, a direct
   function trace could be hit and not see its entry.
 
   This is fixed by allocating the new hash, copy all the old entries
   onto it as well as the new entries, and then use rcu_assign_pointer()
   to update the new direct_function hash with it.
 
   This also fixes a memory leak in that code.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYIADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCZZAzTxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qs9IAP9e6wZ74aEjMED9nsbC49EpyCNTqa72
 y0uDS/p9ppv52gD7Be+l+kJQzYNh6bZU0+B19hNC2QVn38jb7sOadfO/1Q8=
 =NDkf
 -----END PGP SIGNATURE-----

Merge tag 'trace-v6.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing fixes from Steven Rostedt:

 - Fix readers that are blocked on the ring buffer when buffer_percent
   is 100%. They are supposed to wake up when the buffer is full, but
   because the sub-buffer that the writer is on is never considered
   "dirty" in the calculation, dirty pages will never equal nr_pages.
   Add +1 to the dirty count in order to count for the sub-buffer that
   the writer is on.

 - When a reader is blocked on the "snapshot_raw" file, it is to be
   woken up when a snapshot is done and be able to read the snapshot
   buffer. But because the snapshot swaps the buffers (the main one with
   the snapshot one), and the snapshot reader is waiting on the old
   snapshot buffer, it was not woken up (because it is now on the main
   buffer after the swap). Worse yet, when it reads the buffer after a
   snapshot, it's not reading the snapshot buffer, it's reading the live
   active main buffer.

   Fix this by forcing a wakeup of all readers on the snapshot buffer
   when a new snapshot happens, and then update the buffer that the
   reader is reading to be back on the snapshot buffer.

 - Fix the modification of the direct_function hash. There was a race
   when new functions were added to the direct_function hash as when it
   moved function entries from the old hash to the new one, a direct
   function trace could be hit and not see its entry.

   This is fixed by allocating the new hash, copy all the old entries
   onto it as well as the new entries, and then use rcu_assign_pointer()
   to update the new direct_function hash with it.

   This also fixes a memory leak in that code.

 - Fix eventfs ownership

* tag 'trace-v6.7-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ftrace: Fix modification of direct_function hash while in use
  tracing: Fix blocked reader of snapshot buffer
  ring-buffer: Fix wake ups when buffer_percent is set to 100
  eventfs: Fix file and directory uid and gid ownership
This commit is contained in:
Linus Torvalds 2023-12-30 11:37:35 -08:00
commit 453f5db061
6 changed files with 176 additions and 69 deletions

View File

@ -113,7 +113,14 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* determined by the parent directory.
*/
if (dentry->d_inode->i_mode & S_IFDIR) {
update_attr(&ei->attr, iattr);
/*
* The events directory dentry is never freed, unless its
* part of an instance that is deleted. It's attr is the
* default for its child files and directories.
* Do not update it. It's not used for its own mode or ownership
*/
if (!ei->is_events)
update_attr(&ei->attr, iattr);
} else {
name = dentry->d_name.name;
@ -148,28 +155,93 @@ static const struct file_operations eventfs_file_operations = {
.release = eventfs_release,
};
/* Return the evenfs_inode of the "events" directory */
static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
{
struct eventfs_inode *ei;
mutex_lock(&eventfs_mutex);
do {
/* The parent always has an ei, except for events itself */
ei = dentry->d_parent->d_fsdata;
/*
* If the ei is being freed, the ownership of the children
* doesn't matter.
*/
if (ei->is_freed) {
ei = NULL;
break;
}
dentry = ei->dentry;
} while (!ei->is_events);
mutex_unlock(&eventfs_mutex);
return ei;
}
static void update_inode_attr(struct dentry *dentry, struct inode *inode,
struct eventfs_attr *attr, umode_t mode)
{
if (!attr) {
inode->i_mode = mode;
struct eventfs_inode *events_ei = eventfs_find_events(dentry);
if (!events_ei)
return;
inode->i_mode = mode;
inode->i_uid = events_ei->attr.uid;
inode->i_gid = events_ei->attr.gid;
if (!attr)
return;
}
if (attr->mode & EVENTFS_SAVE_MODE)
inode->i_mode = attr->mode & EVENTFS_MODE_MASK;
else
inode->i_mode = mode;
if (attr->mode & EVENTFS_SAVE_UID)
inode->i_uid = attr->uid;
else
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
if (attr->mode & EVENTFS_SAVE_GID)
inode->i_gid = attr->gid;
else
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
}
static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
{
struct eventfs_inode *ei_child;
/* at most we have events/system/event */
if (WARN_ON_ONCE(level > 3))
return;
ei->attr.gid = gid;
if (ei->entry_attrs) {
for (int i = 0; i < ei->nr_entries; i++) {
ei->entry_attrs[i].gid = gid;
}
}
/*
* Only eventfs_inode with dentries are updated, make sure
* all eventfs_inodes are updated. If one of the children
* do not have a dentry, this function must traverse it.
*/
list_for_each_entry_srcu(ei_child, &ei->children, list,
srcu_read_lock_held(&eventfs_srcu)) {
if (!ei_child->dentry)
update_gid(ei_child, gid, level + 1);
}
}
void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
{
struct eventfs_inode *ei = dentry->d_fsdata;
int idx;
idx = srcu_read_lock(&eventfs_srcu);
update_gid(ei, gid, 0);
srcu_read_unlock(&eventfs_srcu, idx);
}
/**
@ -860,6 +932,8 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
struct eventfs_inode *ei;
struct tracefs_inode *ti;
struct inode *inode;
kuid_t uid;
kgid_t gid;
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
@ -884,11 +958,20 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
ei->dentry = dentry;
ei->entries = entries;
ei->nr_entries = size;
ei->is_events = 1;
ei->data = data;
ei->name = kstrdup_const(name, GFP_KERNEL);
if (!ei->name)
goto fail;
/* Save the ownership of this directory */
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
/* This is used as the default ownership of the files and directories */
ei->attr.uid = uid;
ei->attr.gid = gid;
INIT_LIST_HEAD(&ei->children);
INIT_LIST_HEAD(&ei->list);
@ -897,6 +980,8 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
ti->private = ei;
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
inode->i_uid = uid;
inode->i_gid = gid;
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;

View File

@ -210,6 +210,7 @@ repeat:
next = this_parent->d_subdirs.next;
resume:
while (next != &this_parent->d_subdirs) {
struct tracefs_inode *ti;
struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
next = tmp->next;
@ -218,6 +219,11 @@ resume:
change_gid(dentry, gid);
/* If this is the events directory, update that too */
ti = get_tracefs(dentry->d_inode);
if (ti && (ti->flags & TRACEFS_EVENT_INODE))
eventfs_update_gid(dentry, gid);
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);

View File

@ -62,6 +62,7 @@ struct eventfs_inode {
struct rcu_head rcu;
};
unsigned int is_freed:1;
unsigned int is_events:1;
unsigned int nr_entries:31;
};
@ -77,6 +78,7 @@ struct inode *tracefs_get_inode(struct super_block *sb);
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
struct dentry *eventfs_failed_creating(struct dentry *dentry);
struct dentry *eventfs_end_creating(struct dentry *dentry);
void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */

View File

@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
static struct ftrace_func_entry *
add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
struct ftrace_func_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return -ENOMEM;
return NULL;
entry->ip = ip;
__add_hash_entry(hash, entry);
return 0;
return entry;
}
static void
@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
int size;
int ret;
int i;
new_hash = alloc_ftrace_hash(size_bits);
@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
ret = add_hash_entry(new_hash, entry->ip);
if (ret < 0)
if (add_hash_entry(new_hash, entry->ip) == NULL)
goto free_hash;
}
}
@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash *direct_functions = EMPTY_HASH;
static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
int ftrace_direct_func_count;
@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
static struct ftrace_func_entry*
ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
struct ftrace_hash **free_hash)
{
struct ftrace_func_entry *entry;
if (ftrace_hash_empty(direct_functions) ||
direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
struct ftrace_hash *new_hash;
int size = ftrace_hash_empty(direct_functions) ? 0 :
direct_functions->count + 1;
if (size < 32)
size = 32;
new_hash = dup_hash(direct_functions, size);
if (!new_hash)
return NULL;
*free_hash = direct_functions;
direct_functions = new_hash;
}
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return NULL;
entry->ip = ip;
entry->direct = addr;
__add_hash_entry(direct_functions, entry);
return entry;
}
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
/* Do nothing if it exists */
if (entry)
return 0;
ret = add_hash_entry(hash, rec->ip);
if (add_hash_entry(hash, rec->ip) == NULL)
ret = -ENOMEM;
}
return ret;
}
@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return 0;
}
return add_hash_entry(hash, ip);
entry = add_hash_entry(hash, ip);
return entry ? 0 : -ENOMEM;
}
static int
@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
*/
int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash, *free_hash = NULL;
struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
int err = -EBUSY, size, i;
@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
}
/* ... and insert them to direct_functions hash. */
err = -ENOMEM;
/* Make a copy hash to place the new and the old entries in */
size = hash->count + direct_functions->count;
if (size > 32)
size = 32;
new_hash = alloc_ftrace_hash(fls(size));
if (!new_hash)
goto out_unlock;
/* Now copy over the existing direct entries */
size = 1 << direct_functions->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
new = add_hash_entry(new_hash, entry->ip);
if (!new)
goto out_unlock;
new->direct = entry->direct;
}
}
/* ... and add the new entries */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
new = add_hash_entry(new_hash, entry->ip);
if (!new)
goto out_remove;
goto out_unlock;
/* Update both the copy and the hash entry */
new->direct = addr;
entry->direct = addr;
}
}
free_hash = direct_functions;
rcu_assign_pointer(direct_functions, new_hash);
new_hash = NULL;
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
err = register_ftrace_function_nolock(ops);
out_remove:
if (err)
remove_direct_functions_hash(hash, addr);
out_unlock:
mutex_unlock(&direct_mutex);
if (free_hash) {
if (free_hash && free_hash != EMPTY_HASH) {
synchronize_rcu_tasks();
free_ftrace_hash(free_hash);
}
if (new_hash)
free_ftrace_hash(new_hash);
return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
if (add_hash_entry(hash, rec->ip) < 0)
if (add_hash_entry(hash, rec->ip) == NULL)
goto out;
} else {
if (entry) {

View File

@ -881,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f
if (!nr_pages || !full)
return true;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
/*
* Add one as dirty will never equal nr_pages, as the sub-buffer
* that the writer is on is not counted as dirty.
* This is needed if "buffer_percent" is set to 100.
*/
dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
return (dirty * 100) > (full * nr_pages);
return (dirty * 100) >= (full * nr_pages);
}
/*
@ -944,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
/* make sure the waiters see the new index */
smp_wmb();
rb_wake_up_waiters(&rbwork->work);
/* This can be called in any context */
irq_work_queue(&rbwork->work);
}
/**

View File

@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
/* Any waiters on the old snapshot buffer need to wake up */
ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
/**
@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
int ret;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
full);
ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
#ifdef CONFIG_TRACER_MAX_TRACE
/*
* Make sure this is still the snapshot buffer, as if a snapshot were
* to happen, this would now be the main buffer.
*/
if (iter->snapshot)
iter->array_buffer = &iter->tr->max_buffer;
#endif
return ret;
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@ -8517,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
wait_index = READ_ONCE(iter->wait_index);
ret = wait_on_pipe(iter, iter->tr->buffer_percent);
ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;