linux/drivers/md/bcache/stats.c
Coly Li 5bebf7486d bcache: fix memory corruption in bch_cache_accounting_clear()
Commit 83ff9318c4 ("bcache: not use hard coded memset size in
bch_cache_accounting_clear()") tries to make the code more easy to
understand by removing the hard coded number with following change,
	void bch_cache_accounting_clear(...)
	{
		memset(&acc->total.cache_hits,
			0,
	-		sizeof(unsigned long) * 7);
	+		sizeof(struct cache_stats));
	}

Unfortunately the change was wrong (it also tells us the original code
was not easy to correctly understand). The hard coded number 7 is used
because in struct cache_stats,
 15 struct cache_stats {
 16         struct kobject          kobj;
 17
 18         unsigned long cache_hits;
 19         unsigned long cache_misses;
 20         unsigned long cache_bypass_hits;
 21         unsigned long cache_bypass_misses;
 22         unsigned long cache_readaheads;
 23         unsigned long cache_miss_collisions;
 24         unsigned long sectors_bypassed;
 25
 26         unsigned int            rescale;
 27 };
only members in LINE 18-24 want to be set to 0. It is wrong to use
'sizeof(struct cache_stats)' to replace 'sizeof(unsigned long) * 7), the
memory objects behind acc->total is staled by this change.

Сорокин Артем Сергеевич reports that by the following steps, kernel
panic will be triggered,
1. Create new set: make-bcache -B /dev/nvme1n1 -C /dev/sda --wipe-bcache
2. Run in /sys/fs/bcache/<uuid>:
   echo 1 > clear_stats && cat stats_five_minute/cache_bypass_hits

I can reproduce the panic and get following dmesg with KASAN enabled,
[22613.172742] ==================================================================
[22613.172862] BUG: KASAN: null-ptr-deref in sysfs_kf_seq_show+0x117/0x230
[22613.172864] Read of size 8 at addr 0000000000000000 by task cat/6753

[22613.172870] CPU: 1 PID: 6753 Comm: cat Not tainted 5.5.0-rc7-lp151.28.16-default+ #11
[22613.172872] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.172873] Call Trace:
[22613.172964]  dump_stack+0x8b/0xbb
[22613.172968]  ? sysfs_kf_seq_show+0x117/0x230
[22613.172970]  ? sysfs_kf_seq_show+0x117/0x230
[22613.173031]  __kasan_report+0x176/0x192
[22613.173064]  ? pr_cont_kernfs_name+0x40/0x60
[22613.173067]  ? sysfs_kf_seq_show+0x117/0x230
[22613.173070]  kasan_report+0xe/0x20
[22613.173072]  sysfs_kf_seq_show+0x117/0x230
[22613.173105]  seq_read+0x199/0x6d0
[22613.173110]  vfs_read+0xa5/0x1a0
[22613.173113]  ksys_read+0x110/0x160
[22613.173115]  ? kernel_write+0xb0/0xb0
[22613.173177]  do_syscall_64+0x77/0x290
[22613.173238]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.173241] RIP: 0033:0x7fc2c886ac61
[22613.173244] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.173245] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.173248] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.173249] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.173250] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.173251] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.173253] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.173255] ==================================================================
[22613.173256] Disabling lock debugging due to kernel taint
[22613.173350] BUG: kernel NULL pointer dereference, address: 0000000000000000
[22613.178380] #PF: supervisor read access in kernel mode
[22613.180959] #PF: error_code(0x0000) - not-present page
[22613.183444] PGD 0 P4D 0
[22613.184867] Oops: 0000 [#1] SMP KASAN PTI
[22613.186797] CPU: 1 PID: 6753 Comm: cat Tainted: G    B             5.5.0-rc7-lp151.28.16-default+ #11
[22613.191253] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/29/2019
[22613.196706] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.199097] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.208016] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.210448] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.213691] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.216893] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.220075] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.223256] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.226290] FS:  00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.229637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.231993] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0
[22613.234909] Call Trace:
[22613.235931]  seq_read+0x199/0x6d0
[22613.237259]  vfs_read+0xa5/0x1a0
[22613.239229]  ksys_read+0x110/0x160
[22613.240590]  ? kernel_write+0xb0/0xb0
[22613.242040]  do_syscall_64+0x77/0x290
[22613.243625]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[22613.245450] RIP: 0033:0x7fc2c886ac61
[22613.246706] Code: fe ff ff 48 8d 3d c7 a0 09 00 48 83 ec 08 e8 46 03 02 00 66 0f 1f 44 00 00 8b 05 ca fb 2c 00 48 63 ff 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 55 53 48 89 d5 48 89
[22613.253296] RSP: 002b:00007ffebe776d68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[22613.255835] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2c886ac61
[22613.258472] RDX: 0000000000020000 RSI: 00007fc2c8cca000 RDI: 0000000000000003
[22613.260807] RBP: 0000000000020000 R08: ffffffffffffffff R09: 0000000000000000
[22613.263188] R10: 000000000000038c R11: 0000000000000246 R12: 00007fc2c8cca000
[22613.265598] R13: 0000000000000003 R14: 00007fc2c8cca00f R15: 0000000000020000
[22613.268729] Modules linked in: scsi_transport_iscsi af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock fuse bnep kvm_intel kvm irqbypass crc32_pclmul crc32c_intel ghash_clmulni_intel snd_ens1371 snd_ac97_codec ac97_bus bcache snd_pcm btusb btrtl btbcm btintel crc64 aesni_intel glue_helper crypto_simd vmw_balloon cryptd bluetooth snd_timer snd_rawmidi snd joydev pcspkr e1000 rfkill vmw_vmci soundcore ecdh_generic ecc gameport i2c_piix4 mptctl ac button hid_generic usbhid sr_mod cdrom ata_generic ehci_pci vmwgfx uhci_hcd drm_kms_helper syscopyarea serio_raw sysfillrect sysimgblt fb_sys_fops ttm ehci_hcd mptspi scsi_transport_spi mptscsih ata_piix mptbase ahci usbcore libahci drm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua
[22613.292429] CR2: 0000000000000000
[22613.293563] ---[ end trace a074b26a8508f378 ]---
[22613.295138] RIP: 0010:sysfs_kf_seq_show+0x117/0x230
[22613.296769] Code: ff 48 8b 0b 48 8b 44 24 08 48 01 e9 eb a6 31 f6 48 89 cf ba 00 10 00 00 48 89 4c 24 10 e8 b1 e6 e9 ff 4c 89 ff e8 19 07 ea ff <49> 8b 07 48 85 c0 48 89 44 24 08 0f 84 91 00 00 00 49 8b 6d 00 48
[22613.303553] RSP: 0018:ffff8881d4f8fd78 EFLAGS: 00010246
[22613.305280] RAX: 0000000000000000 RBX: ffff8881eb99b180 RCX: ffffffff810d9ef6
[22613.307924] RDX: 0000000000000001 RSI: 0000000000000246 RDI: 0000000000000246
[22613.310272] RBP: 0000000000001000 R08: fffffbfff072ddcd R09: fffffbfff072ddcd
[22613.312685] R10: 0000000000000001 R11: fffffbfff072ddcc R12: ffff8881de5c0200
[22613.315076] R13: ffff8881ed175500 R14: ffff8881eb99b198 R15: 0000000000000000
[22613.318116] FS:  00007fc2c8d3d500(0000) GS:ffff8881f2a80000(0000) knlGS:0000000000000000
[22613.320743] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[22613.322628] CR2: 0000000000000000 CR3: 00000001ec89a004 CR4: 00000000003606e0

Here this patch fixes the following problem by explicity set all the 7
members to 0 in bch_cache_accounting_clear().

Reported-by: Сорокин Артем Сергеевич <a.sorokin@bank-hlynov.ru>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-02-01 07:55:39 -07:00

248 lines
6.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* bcache stats code
*
* Copyright 2012 Google, Inc.
*/
#include "bcache.h"
#include "stats.h"
#include "btree.h"
#include "sysfs.h"
/*
* We keep absolute totals of various statistics, and addionally a set of three
* rolling averages.
*
* Every so often, a timer goes off and rescales the rolling averages.
* accounting_rescale[] is how many times the timer has to go off before we
* rescale each set of numbers; that gets us half lives of 5 minutes, one hour,
* and one day.
*
* accounting_delay is how often the timer goes off - 22 times in 5 minutes,
* and accounting_weight is what we use to rescale:
*
* pow(31 / 32, 22) ~= 1/2
*
* So that we don't have to increment each set of numbers every time we (say)
* get a cache hit, we increment a single atomic_t in acc->collector, and when
* the rescale function runs it resets the atomic counter to 0 and adds its
* old value to each of the exported numbers.
*
* To reduce rounding error, the numbers in struct cache_stats are all
* stored left shifted by 16, and scaled back in the sysfs show() function.
*/
static const unsigned int DAY_RESCALE = 288;
static const unsigned int HOUR_RESCALE = 12;
static const unsigned int FIVE_MINUTE_RESCALE = 1;
static const unsigned int accounting_delay = (HZ * 300) / 22;
static const unsigned int accounting_weight = 32;
/* sysfs reading/writing */
read_attribute(cache_hits);
read_attribute(cache_misses);
read_attribute(cache_bypass_hits);
read_attribute(cache_bypass_misses);
read_attribute(cache_hit_ratio);
read_attribute(cache_readaheads);
read_attribute(cache_miss_collisions);
read_attribute(bypassed);
SHOW(bch_stats)
{
struct cache_stats *s =
container_of(kobj, struct cache_stats, kobj);
#define var(stat) (s->stat >> 16)
var_print(cache_hits);
var_print(cache_misses);
var_print(cache_bypass_hits);
var_print(cache_bypass_misses);
sysfs_print(cache_hit_ratio,
DIV_SAFE(var(cache_hits) * 100,
var(cache_hits) + var(cache_misses)));
var_print(cache_readaheads);
var_print(cache_miss_collisions);
sysfs_hprint(bypassed, var(sectors_bypassed) << 9);
#undef var
return 0;
}
STORE(bch_stats)
{
return size;
}
static void bch_stats_release(struct kobject *k)
{
}
static struct attribute *bch_stats_files[] = {
&sysfs_cache_hits,
&sysfs_cache_misses,
&sysfs_cache_bypass_hits,
&sysfs_cache_bypass_misses,
&sysfs_cache_hit_ratio,
&sysfs_cache_readaheads,
&sysfs_cache_miss_collisions,
&sysfs_bypassed,
NULL
};
static KTYPE(bch_stats);
int bch_cache_accounting_add_kobjs(struct cache_accounting *acc,
struct kobject *parent)
{
int ret = kobject_add(&acc->total.kobj, parent,
"stats_total");
ret = ret ?: kobject_add(&acc->five_minute.kobj, parent,
"stats_five_minute");
ret = ret ?: kobject_add(&acc->hour.kobj, parent,
"stats_hour");
ret = ret ?: kobject_add(&acc->day.kobj, parent,
"stats_day");
return ret;
}
void bch_cache_accounting_clear(struct cache_accounting *acc)
{
acc->total.cache_hits = 0;
acc->total.cache_misses = 0;
acc->total.cache_bypass_hits = 0;
acc->total.cache_bypass_misses = 0;
acc->total.cache_readaheads = 0;
acc->total.cache_miss_collisions = 0;
acc->total.sectors_bypassed = 0;
}
void bch_cache_accounting_destroy(struct cache_accounting *acc)
{
kobject_put(&acc->total.kobj);
kobject_put(&acc->five_minute.kobj);
kobject_put(&acc->hour.kobj);
kobject_put(&acc->day.kobj);
atomic_set(&acc->closing, 1);
if (del_timer_sync(&acc->timer))
closure_return(&acc->cl);
}
/* EWMA scaling */
static void scale_stat(unsigned long *stat)
{
*stat = ewma_add(*stat, 0, accounting_weight, 0);
}
static void scale_stats(struct cache_stats *stats, unsigned long rescale_at)
{
if (++stats->rescale == rescale_at) {
stats->rescale = 0;
scale_stat(&stats->cache_hits);
scale_stat(&stats->cache_misses);
scale_stat(&stats->cache_bypass_hits);
scale_stat(&stats->cache_bypass_misses);
scale_stat(&stats->cache_readaheads);
scale_stat(&stats->cache_miss_collisions);
scale_stat(&stats->sectors_bypassed);
}
}
static void scale_accounting(struct timer_list *t)
{
struct cache_accounting *acc = from_timer(acc, t, timer);
#define move_stat(name) do { \
unsigned int t = atomic_xchg(&acc->collector.name, 0); \
t <<= 16; \
acc->five_minute.name += t; \
acc->hour.name += t; \
acc->day.name += t; \
acc->total.name += t; \
} while (0)
move_stat(cache_hits);
move_stat(cache_misses);
move_stat(cache_bypass_hits);
move_stat(cache_bypass_misses);
move_stat(cache_readaheads);
move_stat(cache_miss_collisions);
move_stat(sectors_bypassed);
scale_stats(&acc->total, 0);
scale_stats(&acc->day, DAY_RESCALE);
scale_stats(&acc->hour, HOUR_RESCALE);
scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE);
acc->timer.expires += accounting_delay;
if (!atomic_read(&acc->closing))
add_timer(&acc->timer);
else
closure_return(&acc->cl);
}
static void mark_cache_stats(struct cache_stat_collector *stats,
bool hit, bool bypass)
{
if (!bypass)
if (hit)
atomic_inc(&stats->cache_hits);
else
atomic_inc(&stats->cache_misses);
else
if (hit)
atomic_inc(&stats->cache_bypass_hits);
else
atomic_inc(&stats->cache_bypass_misses);
}
void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
bool hit, bool bypass)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
mark_cache_stats(&dc->accounting.collector, hit, bypass);
mark_cache_stats(&c->accounting.collector, hit, bypass);
}
void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
atomic_inc(&dc->accounting.collector.cache_readaheads);
atomic_inc(&c->accounting.collector.cache_readaheads);
}
void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d)
{
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
atomic_inc(&dc->accounting.collector.cache_miss_collisions);
atomic_inc(&c->accounting.collector.cache_miss_collisions);
}
void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc,
int sectors)
{
atomic_add(sectors, &dc->accounting.collector.sectors_bypassed);
atomic_add(sectors, &c->accounting.collector.sectors_bypassed);
}
void bch_cache_accounting_init(struct cache_accounting *acc,
struct closure *parent)
{
kobject_init(&acc->total.kobj, &bch_stats_ktype);
kobject_init(&acc->five_minute.kobj, &bch_stats_ktype);
kobject_init(&acc->hour.kobj, &bch_stats_ktype);
kobject_init(&acc->day.kobj, &bch_stats_ktype);
closure_init(&acc->cl, parent);
timer_setup(&acc->timer, scale_accounting, 0);
acc->timer.expires = jiffies + accounting_delay;
add_timer(&acc->timer);
}