rcuperf: Add kfree_rcu() performance Tests

This test runs kfree_rcu() in a loop to measure performance of the new
kfree_rcu() batching functionality.
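
At its core, each test thread does something like the following (a condensed
sketch of the kfree_perf_thread() loop added by this patch; timing, CPU
affinity, and the non-batched fallback are omitted, and the helper name
kfree_flood_one_iteration() is only illustrative):

/* One flood iteration: kfree_alloc_num objects queued for freeing after a GP. */
struct kfree_obj {
	char kfree_obj[8];
	struct rcu_head rh;
};

static int kfree_flood_one_iteration(void)
{
	struct kfree_obj *alloc_ptr;
	int i;

	for (i = 0; i < kfree_alloc_num; i++) {
		alloc_ptr = kmalloc(sizeof(*alloc_ptr), GFP_KERNEL);
		if (!alloc_ptr)
			return -ENOMEM;
		/* Queue the object; with batching these are coalesced per CPU. */
		kfree_rcu(alloc_ptr, rh);
	}
	return 0;
}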

The following table shows results when booting with arguments:
rcuperf.kfree_loops=20000 rcuperf.kfree_alloc_num=8000
rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X

rcuperf.kfree_no_batch=X    # Grace Periods	Test Duration (s)
  X=1 (old behavior)              9133                 11.5
  X=0 (new behavior)              1732                 12.5

On a 16 CPU system with the above boot parameters, we see that the total
number of grace periods that elapse during the test drops from 9133 when
not batching to 1732 when batching (a 5X improvement). The kfree_rcu()
flood itself slows down a bit when batching, though, as shown.
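
For reference, the "# Grace Periods" column is derived by snapshotting RCU's
grace-period sequence counter around the flood, along the lines of the
rcuperf.c changes below (variable names as in the diff; surrounding thread
logic omitted):

/* First thread to start records the starting grace-period sequence number. */
b_rcu_gp_test_started = cur_ops->get_gp_seq();

/* ... all threads perform their kfree_rcu() flood ... */

/* Last thread to finish records the final sequence number and reports the delta. */
b_rcu_gp_test_finished = cur_ops->get_gp_seq();
pr_alert("batches: %ld\n",
	 rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));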

Note that the active memory consumption during the kfree_rcu() flood
does increase to around 200-250MB due to the batching (from around 50MB
without batching). However, this memory consumption is relatively
constant. In other words, the system is able to keep up with the
kfree_rcu() load. The memory consumption comes down considerably if
KFREE_DRAIN_JIFFIES is increased from HZ/50 to HZ/80. A later patch will
reduce memory consumption further by using multiple lists.
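
For context, KFREE_DRAIN_JIFFIES is the batch-drain interval from the earlier
kfree_rcu() batching patch that this test exercises; assuming its definition
there, the "increase" above refers to the divisor, i.e. a shorter interval:

/*
 * Assumed definition from the kfree_rcu() batching patch (not part of this
 * commit).  HZ/50 jiffies is a 20 ms drain interval; HZ/80 is 12.5 ms, so the
 * larger divisor hands queued objects to RCU more often and limits buildup.
 */
#define KFREE_DRAIN_JIFFIES	(HZ / 50)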

Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and
CONFIG_PROVE_RCU for realistic comparisons with/without batching.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
commit e6e78b004f (parent a35d16905e)
Author: Joel Fernandes (Google), 2019-08-30 12:36:29 -04:00
Committed by: Paul E. McKenney
2 changed files with 190 additions and 8 deletions

Documentation/admin-guide/kernel-parameters.txt

@@ -3978,6 +3978,23 @@
 			test until boot completes in order to avoid
 			interference.
 
+	rcuperf.kfree_rcu_test= [KNL]
+			Set to measure performance of kfree_rcu() flooding.
+
+	rcuperf.kfree_nthreads= [KNL]
+			The number of threads running loops of kfree_rcu().
+
+	rcuperf.kfree_alloc_num= [KNL]
+			Number of allocations and frees done in an iteration.
+
+	rcuperf.kfree_loops= [KNL]
+			Number of loops doing rcuperf.kfree_alloc_num number
+			of allocations and frees.
+
+	rcuperf.kfree_no_batch= [KNL]
+			Use the non-batching (less efficient) version of kfree_rcu().
+			This is useful for comparing with the batched version.
+
 	rcuperf.nreaders= [KNL]
 			Set number of RCU readers.  The value -1 selects
 			N, where N is the number of CPUs.  A value

kernel/rcu/rcuperf.c

@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
 	      "Shutdown at end of performance tests.");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
 static wait_queue_head_t shutdown_wq;
 static u64 t_rcu_perf_writer_started;
 static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
 static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 #define MAX_MEAS 10000
@@ -378,10 +379,10 @@ rcu_perf_writer(void *arg)
 	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
 		t_rcu_perf_writer_started = t;
 		if (gp_exp) {
-			b_rcu_perf_writer_started =
+			b_rcu_gp_test_started =
 				cur_ops->exp_completed() / 2;
 		} else {
-			b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+			b_rcu_gp_test_started = cur_ops->get_gp_seq();
 		}
 	}
 
@@ -429,10 +430,10 @@ retry:
 		PERFOUT_STRING("Test complete");
 		t_rcu_perf_writer_finished = t;
 		if (gp_exp) {
-			b_rcu_perf_writer_finished =
+			b_rcu_gp_test_finished =
 				cur_ops->exp_completed() / 2;
 		} else {
-			b_rcu_perf_writer_finished =
+			b_rcu_gp_test_finished =
 				cur_ops->get_gp_seq();
 		}
 		if (shutdown) {
@@ -515,8 +516,8 @@ rcu_perf_cleanup(void)
 			 t_rcu_perf_writer_finished -
 			 t_rcu_perf_writer_started,
 			 ngps,
-			 rcuperf_seq_diff(b_rcu_perf_writer_finished,
-					  b_rcu_perf_writer_started));
+			 rcuperf_seq_diff(b_rcu_gp_test_finished,
+					  b_rcu_gp_test_started));
 		for (i = 0; i < nrealwriters; i++) {
 			if (!writer_durations)
 				break;
@@ -584,6 +585,167 @@ rcu_perf_shutdown(void *arg)
 	return -EINVAL;
 }
 
+/*
+ * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number
+ * of iterations and measure total time and number of GP for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
+torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu().");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+struct kfree_obj {
+	char kfree_obj[8];
+	struct rcu_head rh;
+};
+
+static int
+kfree_perf_thread(void *arg)
+{
+	int i, loop = 0;
+	long me = (long)arg;
+	struct kfree_obj *alloc_ptr;
+	u64 start_time, end_time;
+
+	VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+
+	start_time = ktime_get_mono_fast_ns();
+
+	if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) {
+		if (gp_exp)
+			b_rcu_gp_test_started = cur_ops->exp_completed() / 2;
+		else
+			b_rcu_gp_test_started = cur_ops->get_gp_seq();
+	}
+
+	do {
+		for (i = 0; i < kfree_alloc_num; i++) {
+			alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+			if (!alloc_ptr)
+				return -ENOMEM;
+
+			if (!kfree_no_batch) {
+				kfree_rcu(alloc_ptr, rh);
+			} else {
+				rcu_callback_t cb;
+
+				cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
+				kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb);
+			}
+		}
+
+		cond_resched();
+	} while (!torture_must_stop() && ++loop < kfree_loops);
+
+	if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+		end_time = ktime_get_mono_fast_ns();
+
+		if (gp_exp)
+			b_rcu_gp_test_finished = cur_ops->exp_completed() / 2;
+		else
+			b_rcu_gp_test_finished = cur_ops->get_gp_seq();
+
+		pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+			 (unsigned long long)(end_time - start_time), kfree_loops,
+			 rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+		if (shutdown) {
+			smp_mb(); /* Assign before wake. */
+			wake_up(&shutdown_wq);
+		}
+	}
+
+	torture_kthread_stopping("kfree_perf_thread");
+	return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (kfree_reader_tasks) {
+		for (i = 0; i < kfree_nrealthreads; i++)
+			torture_stop_kthread(kfree_perf_thread,
+					     kfree_reader_tasks[i]);
+		kfree(kfree_reader_tasks);
+	}
+
+	torture_cleanup_end();
+}
+
+/*
+ * shutdown kthread.  Just waits to be awakened, then shuts down system.
+ */
+static int
+kfree_perf_shutdown(void *arg)
+{
+	do {
+		wait_event(shutdown_wq,
+			   atomic_read(&n_kfree_perf_thread_ended) >=
+			   kfree_nrealthreads);
+	} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+
+	smp_mb(); /* Wake before output. */
+
+	kfree_perf_cleanup();
+	kernel_power_off();
+	return -EINVAL;
+}
+
+static int __init
+kfree_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+
+	kfree_nrealthreads = compute_real(kfree_nthreads);
+	/* Start up the kthreads. */
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(kfree_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+
+	kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
+				     GFP_KERNEL);
+	if (kfree_reader_tasks == NULL) {
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	for (i = 0; i < kfree_nrealthreads; i++) {
+		firsterr = torture_create_kthread(kfree_perf_thread, (void *)i,
+						  kfree_reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+
+	while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads)
+		schedule_timeout_uninterruptible(1);
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	kfree_perf_cleanup();
+	return firsterr;
+}
+
 static int __init
 rcu_perf_init(void)
 {
@@ -616,6 +778,9 @@ rcu_perf_init(void)
 	if (cur_ops->init)
 		cur_ops->init();
 
+	if (kfree_rcu_test)
+		return kfree_perf_init();
+
 	nrealwriters = compute_real(nwriters);
 	nrealreaders = compute_real(nreaders);
 	atomic_set(&n_rcu_perf_reader_started, 0);