mirror of
https://github.com/torvalds/linux.git
synced 2024-12-30 23:02:08 +00:00
a7e52ad7ed
Chunyu Hu reported:
"per_cpu trace directories and files are created for all possible cpus,
but only the cpus which have ever been on-lined have their own per cpu
ring buffer (allocated by cpuhp threads). While trace_buffers_open, the
open handler for trace file 'trace_pipe_raw' is always trying to access
field of ring_buffer_per_cpu, and would panic with the NULL pointer.
Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV
when openning it if that cpu does not have trace ring buffer.
Reproduce:
cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw
(cpu31 is never on-lined, this is a 16 cores x86_64 box)
Tested with:
1) boot with maxcpus=14, read trace_pipe_raw of cpu15.
Got -NODEV.
2) oneline cpu15, read trace_pipe_raw of cpu15.
Get the raw trace data.
Call trace:
[ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0
[ 5760.961678] tracing_buffers_read+0x1f6/0x230
[ 5760.962695] __vfs_read+0x37/0x160
[ 5760.963498] ? __vfs_read+0x5/0x160
[ 5760.964339] ? security_file_permission+0x9d/0xc0
[ 5760.965451] ? __vfs_read+0x5/0x160
[ 5760.966280] vfs_read+0x8c/0x130
[ 5760.967070] SyS_read+0x55/0xc0
[ 5760.967779] do_syscall_64+0x67/0x150
[ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25"
This was introduced by the addition of the feature to reuse reader pages
instead of re-allocating them. The problem is that the allocation of a
reader page (which is per cpu) does not check if the cpu is online and set
up for the ring buffer.
Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com
Cc: stable@vger.kernel.org
Fixes: 73a757e631
("ring-buffer: Return reader page back into existing ring buffer")
Reported-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
499 lines
11 KiB
C
499 lines
11 KiB
C
/*
|
|
* ring buffer tester and benchmark
|
|
*
|
|
* Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
|
|
*/
|
|
#include <linux/ring_buffer.h>
|
|
#include <linux/completion.h>
|
|
#include <linux/kthread.h>
|
|
#include <uapi/linux/sched/types.h>
|
|
#include <linux/module.h>
|
|
#include <linux/ktime.h>
|
|
#include <asm/local.h>
|
|
|
|
struct rb_page {
|
|
u64 ts;
|
|
local_t commit;
|
|
char data[4080];
|
|
};
|
|
|
|
/* run time and sleep time in seconds */
|
|
#define RUN_TIME 10ULL
|
|
#define SLEEP_TIME 10
|
|
|
|
/* number of events for writer to wake up the reader */
|
|
static int wakeup_interval = 100;
|
|
|
|
static int reader_finish;
|
|
static DECLARE_COMPLETION(read_start);
|
|
static DECLARE_COMPLETION(read_done);
|
|
|
|
static struct ring_buffer *buffer;
|
|
static struct task_struct *producer;
|
|
static struct task_struct *consumer;
|
|
static unsigned long read;
|
|
|
|
static unsigned int disable_reader;
|
|
module_param(disable_reader, uint, 0644);
|
|
MODULE_PARM_DESC(disable_reader, "only run producer");
|
|
|
|
static unsigned int write_iteration = 50;
|
|
module_param(write_iteration, uint, 0644);
|
|
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
|
|
|
|
static int producer_nice = MAX_NICE;
|
|
static int consumer_nice = MAX_NICE;
|
|
|
|
static int producer_fifo = -1;
|
|
static int consumer_fifo = -1;
|
|
|
|
module_param(producer_nice, int, 0644);
|
|
MODULE_PARM_DESC(producer_nice, "nice prio for producer");
|
|
|
|
module_param(consumer_nice, int, 0644);
|
|
MODULE_PARM_DESC(consumer_nice, "nice prio for consumer");
|
|
|
|
module_param(producer_fifo, int, 0644);
|
|
MODULE_PARM_DESC(producer_fifo, "fifo prio for producer");
|
|
|
|
module_param(consumer_fifo, int, 0644);
|
|
MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
|
|
|
|
static int read_events;
|
|
|
|
static int test_error;
|
|
|
|
#define TEST_ERROR() \
|
|
do { \
|
|
if (!test_error) { \
|
|
test_error = 1; \
|
|
WARN_ON(1); \
|
|
} \
|
|
} while (0)
|
|
|
|
enum event_status {
|
|
EVENT_FOUND,
|
|
EVENT_DROPPED,
|
|
};
|
|
|
|
static bool break_test(void)
|
|
{
|
|
return test_error || kthread_should_stop();
|
|
}
|
|
|
|
static enum event_status read_event(int cpu)
|
|
{
|
|
struct ring_buffer_event *event;
|
|
int *entry;
|
|
u64 ts;
|
|
|
|
event = ring_buffer_consume(buffer, cpu, &ts, NULL);
|
|
if (!event)
|
|
return EVENT_DROPPED;
|
|
|
|
entry = ring_buffer_event_data(event);
|
|
if (*entry != cpu) {
|
|
TEST_ERROR();
|
|
return EVENT_DROPPED;
|
|
}
|
|
|
|
read++;
|
|
return EVENT_FOUND;
|
|
}
|
|
|
|
static enum event_status read_page(int cpu)
|
|
{
|
|
struct ring_buffer_event *event;
|
|
struct rb_page *rpage;
|
|
unsigned long commit;
|
|
void *bpage;
|
|
int *entry;
|
|
int ret;
|
|
int inc;
|
|
int i;
|
|
|
|
bpage = ring_buffer_alloc_read_page(buffer, cpu);
|
|
if (IS_ERR(bpage))
|
|
return EVENT_DROPPED;
|
|
|
|
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
|
|
if (ret >= 0) {
|
|
rpage = bpage;
|
|
/* The commit may have missed event flags set, clear them */
|
|
commit = local_read(&rpage->commit) & 0xfffff;
|
|
for (i = 0; i < commit && !test_error ; i += inc) {
|
|
|
|
if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
|
|
TEST_ERROR();
|
|
break;
|
|
}
|
|
|
|
inc = -1;
|
|
event = (void *)&rpage->data[i];
|
|
switch (event->type_len) {
|
|
case RINGBUF_TYPE_PADDING:
|
|
/* failed writes may be discarded events */
|
|
if (!event->time_delta)
|
|
TEST_ERROR();
|
|
inc = event->array[0] + 4;
|
|
break;
|
|
case RINGBUF_TYPE_TIME_EXTEND:
|
|
inc = 8;
|
|
break;
|
|
case 0:
|
|
entry = ring_buffer_event_data(event);
|
|
if (*entry != cpu) {
|
|
TEST_ERROR();
|
|
break;
|
|
}
|
|
read++;
|
|
if (!event->array[0]) {
|
|
TEST_ERROR();
|
|
break;
|
|
}
|
|
inc = event->array[0] + 4;
|
|
break;
|
|
default:
|
|
entry = ring_buffer_event_data(event);
|
|
if (*entry != cpu) {
|
|
TEST_ERROR();
|
|
break;
|
|
}
|
|
read++;
|
|
inc = ((event->type_len + 1) * 4);
|
|
}
|
|
if (test_error)
|
|
break;
|
|
|
|
if (inc <= 0) {
|
|
TEST_ERROR();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
ring_buffer_free_read_page(buffer, cpu, bpage);
|
|
|
|
if (ret < 0)
|
|
return EVENT_DROPPED;
|
|
return EVENT_FOUND;
|
|
}
|
|
|
|
static void ring_buffer_consumer(void)
|
|
{
|
|
/* toggle between reading pages and events */
|
|
read_events ^= 1;
|
|
|
|
read = 0;
|
|
/*
|
|
* Continue running until the producer specifically asks to stop
|
|
* and is ready for the completion.
|
|
*/
|
|
while (!READ_ONCE(reader_finish)) {
|
|
int found = 1;
|
|
|
|
while (found && !test_error) {
|
|
int cpu;
|
|
|
|
found = 0;
|
|
for_each_online_cpu(cpu) {
|
|
enum event_status stat;
|
|
|
|
if (read_events)
|
|
stat = read_event(cpu);
|
|
else
|
|
stat = read_page(cpu);
|
|
|
|
if (test_error)
|
|
break;
|
|
|
|
if (stat == EVENT_FOUND)
|
|
found = 1;
|
|
|
|
}
|
|
}
|
|
|
|
/* Wait till the producer wakes us up when there is more data
|
|
* available or when the producer wants us to finish reading.
|
|
*/
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
if (reader_finish)
|
|
break;
|
|
|
|
schedule();
|
|
}
|
|
__set_current_state(TASK_RUNNING);
|
|
reader_finish = 0;
|
|
complete(&read_done);
|
|
}
|
|
|
|
static void ring_buffer_producer(void)
|
|
{
|
|
ktime_t start_time, end_time, timeout;
|
|
unsigned long long time;
|
|
unsigned long long entries;
|
|
unsigned long long overruns;
|
|
unsigned long missed = 0;
|
|
unsigned long hit = 0;
|
|
unsigned long avg;
|
|
int cnt = 0;
|
|
|
|
/*
|
|
* Hammer the buffer for 10 secs (this may
|
|
* make the system stall)
|
|
*/
|
|
trace_printk("Starting ring buffer hammer\n");
|
|
start_time = ktime_get();
|
|
timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC);
|
|
do {
|
|
struct ring_buffer_event *event;
|
|
int *entry;
|
|
int i;
|
|
|
|
for (i = 0; i < write_iteration; i++) {
|
|
event = ring_buffer_lock_reserve(buffer, 10);
|
|
if (!event) {
|
|
missed++;
|
|
} else {
|
|
hit++;
|
|
entry = ring_buffer_event_data(event);
|
|
*entry = smp_processor_id();
|
|
ring_buffer_unlock_commit(buffer, event);
|
|
}
|
|
}
|
|
end_time = ktime_get();
|
|
|
|
cnt++;
|
|
if (consumer && !(cnt % wakeup_interval))
|
|
wake_up_process(consumer);
|
|
|
|
#ifndef CONFIG_PREEMPT
|
|
/*
|
|
* If we are a non preempt kernel, the 10 second run will
|
|
* stop everything while it runs. Instead, we will call
|
|
* cond_resched and also add any time that was lost by a
|
|
* rescedule.
|
|
*
|
|
* Do a cond resched at the same frequency we would wake up
|
|
* the reader.
|
|
*/
|
|
if (cnt % wakeup_interval)
|
|
cond_resched();
|
|
#endif
|
|
} while (ktime_before(end_time, timeout) && !break_test());
|
|
trace_printk("End ring buffer hammer\n");
|
|
|
|
if (consumer) {
|
|
/* Init both completions here to avoid races */
|
|
init_completion(&read_start);
|
|
init_completion(&read_done);
|
|
/* the completions must be visible before the finish var */
|
|
smp_wmb();
|
|
reader_finish = 1;
|
|
wake_up_process(consumer);
|
|
wait_for_completion(&read_done);
|
|
}
|
|
|
|
time = ktime_us_delta(end_time, start_time);
|
|
|
|
entries = ring_buffer_entries(buffer);
|
|
overruns = ring_buffer_overruns(buffer);
|
|
|
|
if (test_error)
|
|
trace_printk("ERROR!\n");
|
|
|
|
if (!disable_reader) {
|
|
if (consumer_fifo < 0)
|
|
trace_printk("Running Consumer at nice: %d\n",
|
|
consumer_nice);
|
|
else
|
|
trace_printk("Running Consumer at SCHED_FIFO %d\n",
|
|
consumer_fifo);
|
|
}
|
|
if (producer_fifo < 0)
|
|
trace_printk("Running Producer at nice: %d\n",
|
|
producer_nice);
|
|
else
|
|
trace_printk("Running Producer at SCHED_FIFO %d\n",
|
|
producer_fifo);
|
|
|
|
/* Let the user know that the test is running at low priority */
|
|
if (producer_fifo < 0 && consumer_fifo < 0 &&
|
|
producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
|
|
trace_printk("WARNING!!! This test is running at lowest priority.\n");
|
|
|
|
trace_printk("Time: %lld (usecs)\n", time);
|
|
trace_printk("Overruns: %lld\n", overruns);
|
|
if (disable_reader)
|
|
trace_printk("Read: (reader disabled)\n");
|
|
else
|
|
trace_printk("Read: %ld (by %s)\n", read,
|
|
read_events ? "events" : "pages");
|
|
trace_printk("Entries: %lld\n", entries);
|
|
trace_printk("Total: %lld\n", entries + overruns + read);
|
|
trace_printk("Missed: %ld\n", missed);
|
|
trace_printk("Hit: %ld\n", hit);
|
|
|
|
/* Convert time from usecs to millisecs */
|
|
do_div(time, USEC_PER_MSEC);
|
|
if (time)
|
|
hit /= (long)time;
|
|
else
|
|
trace_printk("TIME IS ZERO??\n");
|
|
|
|
trace_printk("Entries per millisec: %ld\n", hit);
|
|
|
|
if (hit) {
|
|
/* Calculate the average time in nanosecs */
|
|
avg = NSEC_PER_MSEC / hit;
|
|
trace_printk("%ld ns per entry\n", avg);
|
|
}
|
|
|
|
if (missed) {
|
|
if (time)
|
|
missed /= (long)time;
|
|
|
|
trace_printk("Total iterations per millisec: %ld\n",
|
|
hit + missed);
|
|
|
|
/* it is possible that hit + missed will overflow and be zero */
|
|
if (!(hit + missed)) {
|
|
trace_printk("hit + missed overflowed and totalled zero!\n");
|
|
hit--; /* make it non zero */
|
|
}
|
|
|
|
/* Caculate the average time in nanosecs */
|
|
avg = NSEC_PER_MSEC / (hit + missed);
|
|
trace_printk("%ld ns per entry\n", avg);
|
|
}
|
|
}
|
|
|
|
static void wait_to_die(void)
|
|
{
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
while (!kthread_should_stop()) {
|
|
schedule();
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
}
|
|
__set_current_state(TASK_RUNNING);
|
|
}
|
|
|
|
static int ring_buffer_consumer_thread(void *arg)
|
|
{
|
|
while (!break_test()) {
|
|
complete(&read_start);
|
|
|
|
ring_buffer_consumer();
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
if (break_test())
|
|
break;
|
|
schedule();
|
|
}
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
if (!kthread_should_stop())
|
|
wait_to_die();
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ring_buffer_producer_thread(void *arg)
|
|
{
|
|
while (!break_test()) {
|
|
ring_buffer_reset(buffer);
|
|
|
|
if (consumer) {
|
|
wake_up_process(consumer);
|
|
wait_for_completion(&read_start);
|
|
}
|
|
|
|
ring_buffer_producer();
|
|
if (break_test())
|
|
goto out_kill;
|
|
|
|
trace_printk("Sleeping for 10 secs\n");
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
if (break_test())
|
|
goto out_kill;
|
|
schedule_timeout(HZ * SLEEP_TIME);
|
|
}
|
|
|
|
out_kill:
|
|
__set_current_state(TASK_RUNNING);
|
|
if (!kthread_should_stop())
|
|
wait_to_die();
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int __init ring_buffer_benchmark_init(void)
|
|
{
|
|
int ret;
|
|
|
|
/* make a one meg buffer in overwite mode */
|
|
buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
|
|
if (!buffer)
|
|
return -ENOMEM;
|
|
|
|
if (!disable_reader) {
|
|
consumer = kthread_create(ring_buffer_consumer_thread,
|
|
NULL, "rb_consumer");
|
|
ret = PTR_ERR(consumer);
|
|
if (IS_ERR(consumer))
|
|
goto out_fail;
|
|
}
|
|
|
|
producer = kthread_run(ring_buffer_producer_thread,
|
|
NULL, "rb_producer");
|
|
ret = PTR_ERR(producer);
|
|
|
|
if (IS_ERR(producer))
|
|
goto out_kill;
|
|
|
|
/*
|
|
* Run them as low-prio background tasks by default:
|
|
*/
|
|
if (!disable_reader) {
|
|
if (consumer_fifo >= 0) {
|
|
struct sched_param param = {
|
|
.sched_priority = consumer_fifo
|
|
};
|
|
sched_setscheduler(consumer, SCHED_FIFO, ¶m);
|
|
} else
|
|
set_user_nice(consumer, consumer_nice);
|
|
}
|
|
|
|
if (producer_fifo >= 0) {
|
|
struct sched_param param = {
|
|
.sched_priority = producer_fifo
|
|
};
|
|
sched_setscheduler(producer, SCHED_FIFO, ¶m);
|
|
} else
|
|
set_user_nice(producer, producer_nice);
|
|
|
|
return 0;
|
|
|
|
out_kill:
|
|
if (consumer)
|
|
kthread_stop(consumer);
|
|
|
|
out_fail:
|
|
ring_buffer_free(buffer);
|
|
return ret;
|
|
}
|
|
|
|
static void __exit ring_buffer_benchmark_exit(void)
|
|
{
|
|
kthread_stop(producer);
|
|
if (consumer)
|
|
kthread_stop(consumer);
|
|
ring_buffer_free(buffer);
|
|
}
|
|
|
|
module_init(ring_buffer_benchmark_init);
|
|
module_exit(ring_buffer_benchmark_exit);
|
|
|
|
MODULE_AUTHOR("Steven Rostedt");
|
|
MODULE_DESCRIPTION("ring_buffer_benchmark");
|
|
MODULE_LICENSE("GPL");
|