Add benchmark to measure the throughput and latency of the bpf_loop
call.
Testing this on my dev machine on 1 thread, the data is as follows:
nr_loops: 10
bpf_loop - throughput: 198.519 ± 0.155 M ops/s, latency: 5.037 ns/op
nr_loops: 100
bpf_loop - throughput: 247.448 ± 0.305 M ops/s, latency: 4.041 ns/op
nr_loops: 500
bpf_loop - throughput: 260.839 ± 0.380 M ops/s, latency: 3.834 ns/op
nr_loops: 1000
bpf_loop - throughput: 262.806 ± 0.629 M ops/s, latency: 3.805 ns/op
nr_loops: 5000
bpf_loop - throughput: 264.211 ± 1.508 M ops/s, latency: 3.785 ns/op
nr_loops: 10000
bpf_loop - throughput: 265.366 ± 3.054 M ops/s, latency: 3.768 ns/op
nr_loops: 50000
bpf_loop - throughput: 235.986 ± 20.205 M ops/s, latency: 4.238 ns/op
nr_loops: 100000
bpf_loop - throughput: 264.482 ± 0.279 M ops/s, latency: 3.781 ns/op
nr_loops: 500000
bpf_loop - throughput: 309.773 ± 87.713 M ops/s, latency: 3.228 ns/op
nr_loops: 1000000
bpf_loop - throughput: 262.818 ± 4.143 M ops/s, latency: 3.805 ns/op
From this data, we can see that the latency per loop decreases as the
number of loops increases. On this particular machine, each loop had an
overhead of about 4 ns, and we were able to run ~250 million loops
per second.
Signed-off-by: Joanne Koong <joannekoong@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20211130030622.4131246-5-joannekoong@fb.com
87 lines
1.9 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#pragma once
|
|
#include <stdlib.h>
|
|
#include <stdbool.h>
|
|
#include <linux/err.h>
|
|
#include <errno.h>
|
|
#include <unistd.h>
|
|
#include <bpf/bpf.h>
|
|
#include <bpf/libbpf.h>
|
|
#include <math.h>
|
|
#include <time.h>
|
|
#include <sys/syscall.h>
|
|
|
|
/* A set of CPUs that benchmark worker threads may be pinned to.
 * Field meanings are inferred from names; confirm against bench.c.
 */
struct cpu_set {
	bool *cpus;	/* cpus[i] true => CPU i is usable; NOTE(review): presumably heap-allocated */
	int cpus_len;	/* number of entries in the cpus array */
	int next_cpu;	/* next index to try when assigning a thread -- presumably round-robin */
};
|
|
|
|
/* Global benchmark configuration (a single instance is declared extern
 * below and presumably populated from command-line args in bench.c).
 */
struct env {
	char *bench_name;	/* name of the benchmark to run */
	int duration_sec;	/* measured run time, in seconds */
	int warmup_sec;		/* warm-up period before measuring, in seconds */
	bool verbose;		/* enable verbose output */
	bool list;		/* NOTE(review): presumably "list benchmarks and exit" -- confirm */
	bool affinity;		/* pin worker threads using the CPU sets below */
	int consumer_cnt;	/* number of consumer threads */
	int producer_cnt;	/* number of producer threads */
	struct cpu_set prod_cpus;	/* CPUs available to producer threads */
	struct cpu_set cons_cpus;	/* CPUs available to consumer threads */
};
|
|
|
|
/* One measurement sample, filled in by a benchmark's measure() callback
 * and consumed by the report_* callbacks.
 */
struct bench_res {
	long hits;	/* operations completed in the interval */
	long drops;	/* operations dropped/failed in the interval */
	long false_hits;	/* spurious hits (see the false_hits_* reporters) */
};
|
|
|
|
/* A benchmark definition: a named bundle of callbacks that the harness
 * (bench.c) drives through validate/setup, the worker threads, periodic
 * measurement, and reporting.
 */
struct bench {
	const char *name;	/* unique benchmark name, used for selection */
	/* NOTE(review): empty parens mean "unspecified parameters" in C,
	 * not "no parameters"; consider (void) if all implementations
	 * really take no arguments -- confirm against the benchmarks.
	 */
	void (*validate)();	/* validate configuration before running */
	void (*setup)();	/* one-time setup before threads start */
	void *(*producer_thread)(void *ctx);	/* producer thread body */
	void *(*consumer_thread)(void *ctx);	/* consumer thread body */
	void (*measure)(struct bench_res* res);	/* snapshot counters into *res */
	/* report one interval's result; delta_ns is the interval length in ns */
	void (*report_progress)(int iter, struct bench_res* res, long delta_ns);
	/* report aggregate statistics over all res_cnt collected samples */
	void (*report_final)(struct bench_res res[], int res_cnt);
};
|
|
|
|
/* A single counter, aligned to 128 bytes -- presumably so that adjacent
 * counters in an array never share a cache line (128 covers two 64-byte
 * lines, guarding against adjacent-line prefetch false sharing).
 */
struct counter {
	long value;
} __attribute__((aligned(128)));
|
|
|
|
extern struct env env;
|
|
extern const struct bench *bench;
|
|
|
|
void setup_libbpf();
|
|
void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
|
|
void hits_drops_report_final(struct bench_res res[], int res_cnt);
|
|
void false_hits_report_progress(int iter, struct bench_res *res, long delta_ns);
|
|
void false_hits_report_final(struct bench_res res[], int res_cnt);
|
|
void ops_report_progress(int iter, struct bench_res *res, long delta_ns);
|
|
void ops_report_final(struct bench_res res[], int res_cnt);
|
|
|
|
/* Return the current CLOCK_MONOTONIC time in nanoseconds. */
static inline __u64 get_time_ns(void)
{
	struct timespec t;

	clock_gettime(CLOCK_MONOTONIC, &t);

	/* Cast to __u64 before scaling so the multiplication cannot
	 * overflow a 32-bit time_t. The original cast spelled this
	 * "(u64)", a type name none of this header's includes define.
	 */
	return (__u64)t.tv_sec * 1000000000 + t.tv_nsec;
}
|
|
|
|
/* Atomically add 1 to *value; relaxed ordering, result discarded. */
static inline void atomic_inc(long *value)
{
	(void)__atomic_fetch_add(value, 1, __ATOMIC_RELAXED);
}
|
|
|
|
/* Atomically add n to *value; relaxed ordering, result discarded. */
static inline void atomic_add(long *value, long n)
{
	(void)__atomic_fetch_add(value, n, __ATOMIC_RELAXED);
}
|
|
|
|
/* Atomically store n into *value and return the previous contents
 * (relaxed ordering).
 */
static inline long atomic_swap(long *value, long n)
{
	long prev = __atomic_exchange_n(value, n, __ATOMIC_RELAXED);

	return prev;
}
|