From 9642c8c44d0db43bc20a166dd70ac6d2ab3ce5b9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 3 Nov 2021 16:07:36 +0100 Subject: [PATCH 001/336] gfs2: Only dereference i->iov when iter_is_iovec(i) Only dereference i->iov after establishing that i is of type ITER_IOVEC. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index adafaaf7d24d..c486b702e00f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -773,8 +773,8 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, size_t *prev_count, size_t *window_size) { - char __user *p = i->iov[0].iov_base + i->iov_offset; size_t count = iov_iter_count(i); + char __user *p; int pages = 1; if (likely(!count)) @@ -787,14 +787,14 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (*prev_count != count || !*window_size) { int pages, nr_dirtied; - pages = min_t(int, BIO_MAX_VECS, - DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE)); + pages = min_t(int, BIO_MAX_VECS, DIV_ROUND_UP(count, PAGE_SIZE)); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 1); pages = min(pages, nr_dirtied); } *prev_count = count; + p = i->iov[0].iov_base + i->iov_offset; *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); return true; } From 7a92deaae613c3b95f0fd02814bb09be7f7a5820 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 3 Nov 2021 16:15:51 +0100 Subject: [PATCH 002/336] gfs2: Fix atomic bug in gfs2_instantiate Replace test_bit() + set_bit() with test_and_set_bit() where we need an atomic operation. Use clear_and_wake_up_bit() instead of open coding it. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 19f38aee1b61..258d8aae7c53 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -496,7 +496,7 @@ again: * Since we unlock the lockref lock, we set a flag to indicate * instantiate is in progress. */ - if (test_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { + if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, TASK_UNINTERRUPTIBLE); /* @@ -509,14 +509,10 @@ again: goto again; } - set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - ret = glops->go_instantiate(gh); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); - clear_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); - smp_mb__after_atomic(); - wake_up_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG); + clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); return ret; } From 49462e2be119d38c5eb5759d0d1b712df3a41239 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 28 Oct 2021 11:53:10 -0500 Subject: [PATCH 003/336] gfs2: release iopen glock early in evict Before this patch, evict would clear the iopen glock's gl_object after releasing the inode glock. In the meantime, another process could reuse the same block and thus glocks for a new inode. It would lock the inode glock (exclusively), and then the iopen glock (shared). The shared locking mode doesn't provide any ordering against the evict, so by the time the iopen glock is reused, evict may not have gotten to setting gl_object to NULL. Fix that by releasing the iopen glock before the inode glock in gfs2_evict_inode. 
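Laid out as a timeline, the window being closed looks roughly like this (A = evict of the old inode, B = re-creation of an inode in the same block; this just restates the scenario above):

  A: releases the inode glock; gl_object of the iopen glock is still set
  B: reuses the block, locks the inode glock (EX), then the iopen glock
     (SH) -- the shared mode provides no ordering against A
  A: only now clears the iopen glock's gl_object, after the glock has
     already been picked up for B's new inode

Releasing and clearing the iopen glock before the inode glock removes that window.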
Signed-off-by: Bob Peterson gl_object Signed-off-by: Andreas Gruenbacher --- fs/gfs2/super.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b121371508a..0f93e8beca4d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1402,13 +1402,6 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - if (ip->i_gl) { - glock_clear_object(ip->i_gl, ip); - wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); - gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put_eventually(ip->i_gl); - ip->i_gl = NULL; - } if (gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; @@ -1421,6 +1414,13 @@ out: gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_put_eventually(gl); } + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); + wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); + gfs2_glock_add_to_lru(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); + ip->i_gl = NULL; + } } static struct inode *gfs2_alloc_inode(struct super_block *sb) From f3506eee81d1f700d9ee2d2f4a88fddb669ec032 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 6 Nov 2021 00:18:56 +0100 Subject: [PATCH 004/336] gfs2: Fix length of holes reported at end-of-file Fix the length of holes reported at the end of a file: the length is relative to the beginning of the extent, not the seek position which is rounded down to the filesystem block size. This bug went unnoticed for some time, but is now caught by the following assertion in iomap_iter_done(): WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos) Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 7235d539e969..d67108489148 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -940,7 +940,7 @@ do_alloc: else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else - iomap->length = size - pos; + iomap->length = size - iomap->offset; } else if (flags & IOMAP_WRITE) { u64 alloc_size; From a7ac203d8fd366bb6318f856466f52484c623d05 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 8 Nov 2021 16:08:07 +0100 Subject: [PATCH 005/336] gfs2: Fix "Introduce flag for glock holder auto-demotion" Function demote_incompat_holders iterates over the list of glock holders with list_for_each_entry, and it then sometimes removes the current holder from the list. This will get the loop stuck; we must use list_for_each_entry_safe instead. Reported-by: Dan Carpenter Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 258d8aae7c53..8dbd6fe66420 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -411,14 +411,14 @@ static void do_error(struct gfs2_glock *gl, const int ret) static void demote_incompat_holders(struct gfs2_glock *gl, struct gfs2_holder *new_gh) { - struct gfs2_holder *gh; + struct gfs2_holder *gh, *tmp; /* * Demote incompatible holders before we make ourselves eligible. * (This holder may or may not allow auto-demoting, but we don't want * to demote the new holder before it's even granted.) */ - list_for_each_entry(gh, &gl->gl_holders, gh_list) { + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { /* * Since holders are at the front of the list, we stop when we * find the first non-holder. 
From a48fc69fe6588b48d878d69de223b91a386a7cb4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Nov 2021 15:22:35 +0100 Subject: [PATCH 006/336] udf: Fix crash after seekdir udf_readdir() didn't validate the directory position it should start reading from. Thus when user uses lseek(2) on directory file descriptor it can trick udf_readdir() into reading from a position in the middle of directory entry which then upsets directory parsing code resulting in errors or even possible kernel crashes. Similarly when the directory is modified between two readdir calls, the directory position need not be valid anymore. Add code to validate current offset in the directory. This is actually rather expensive for UDF as we need to read from the beginning of the directory and parse all directory entries. This is because in UDF a directory is just a stream of data containing directory entries and since file names are fully under user's control we cannot depend on detecting magic numbers and checksums in the header of directory entry as a malicious attacker could fake them. We skip this step if we detect that nothing changed since the last readdir call. Reported-by: Nathan Wilson CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/udf/dir.c | 32 ++++++++++++++++++++++++++++++-- fs/udf/namei.c | 3 +++ fs/udf/super.c | 2 ++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 70abdfad2df1..42e3e551fa4c 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "udf_i.h" #include "udf_sb.h" @@ -43,7 +44,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; udf_pblk_t block, iblock; - loff_t nf_pos; + loff_t nf_pos, emit_pos = 0; int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; @@ -57,6 +58,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; struct super_block *sb = dir->i_sb; + bool pos_valid = false; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -67,6 +69,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos >= size) goto out; + /* + * Something changed since last readdir (either lseek was called or dir + * changed)? We need to verify the position correctly points at the + * beginning of some dir entry so that the directory parsing code does + * not get confused. Since UDF does not have any reliable way of + * identifying beginning of dir entry (names are under user control), + * we need to scan the directory from the beginning. + */ + if (!inode_eq_iversion(dir, file->f_version)) { + emit_pos = nf_pos; + nf_pos = 0; + } else { + pos_valid = true; + } + fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) { ret = -ENOMEM; @@ -122,13 +139,21 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) while (nf_pos < size) { struct kernel_lb_addr tloc; + loff_t cur_pos = nf_pos; - ctx->pos = (nf_pos >> 2) + 1; + /* Update file position only if we got past the current one */ + if (nf_pos >= emit_pos) { + ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; + } fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); if (!fi) goto out; + /* Still not at offset where user asked us to read from? 
*/ + if (cur_pos < emit_pos) + continue; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; @@ -186,8 +211,11 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } /* end while */ ctx->pos = (nf_pos >> 2) + 1; + pos_valid = true; out: + if (pos_valid) + file->f_version = inode_query_iversion(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index caeef08efed2..0ed4861b038f 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -30,6 +30,7 @@ #include #include #include +#include static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) @@ -134,6 +135,8 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, mark_buffer_dirty_inode(fibh->ebh, inode); mark_buffer_dirty_inode(fibh->sbh, inode); } + inode_inc_iversion(inode); + return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 34247fba6df9..f26b5e0b84b6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "udf_sb.h" #include "udf_i.h" @@ -149,6 +150,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb) init_rwsem(&ei->i_data_sem); ei->cached_extent.lstart = -1; spin_lock_init(&ei->i_extent_cache_lock); + inode_set_iversion(&ei->vfs_inode, 1); return &ei->vfs_inode; } From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 7 Nov 2021 14:51:16 +1000 Subject: [PATCH 007/336] printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces printk from NMI context relies on irq work being raised on the local CPU to print to console. This can be a problem if the NMI was raised by a lockup detector to print lockup stack and regs, because the CPU may not enable irqs (because it is locked up). Introduce printk_trigger_flush() that can be called another CPU to try to get those messages to the console, call that where printk_safe_flush was previously called. Fixes: 93d102f094be ("printk: remove safe buffers") Cc: stable@vger.kernel.org # 5.15 Signed-off-by: Nicholas Piggin Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 6 ++++++ include/linux/printk.h | 4 ++++ kernel/printk/printk.c | 5 +++++ lib/nmi_backtrace.c | 6 ++++++ 4 files changed, 21 insertions(+) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index dc17d8903d4f..6b7a83d5e03e 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -186,6 +186,12 @@ static void watchdog_smp_panic(int cpu, u64 tb) if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); + /* + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. 
+ */ + printk_trigger_flush(); + if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); diff --git a/include/linux/printk.h b/include/linux/printk.h index a1379df43251..596ad6fa0336 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -206,6 +206,7 @@ void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; +void printk_trigger_flush(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -282,6 +283,9 @@ static inline void dump_stack_lvl(const char *log_lvl) static inline void dump_stack(void) { } +static inline void printk_trigger_flush(void) +{ +} #endif #ifdef CONFIG_SMP diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 65fffa6368c9..eabe23b0a982 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3261,6 +3261,11 @@ void defer_console_output(void) preempt_enable(); } +void printk_trigger_flush(void) +{ + defer_console_output(); +} + int vprintk_deferred(const char *fmt, va_list args) { int r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index f9e89001b52e..199ab201d501 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -75,6 +75,12 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, touch_softlockup_watchdog(); } + /* + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. + */ + printk_trigger_flush(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } From 554c577cee95bdc1d03d9f457e57dc96eb791845 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 9 Nov 2021 13:57:17 +0100 Subject: [PATCH 008/336] gfs2: Prevent endless loops in gfs2_file_buffered_write Currently, instead of performing a short write, iomap_file_buffered_write will fail when part of its iov iterator cannot be read. In contrast, gfs2_file_buffered_write will loop around if it can read part of the iov iterator, so we can end up in an endless loop. This should be fixed in iomap_file_buffered_write (and also generic_perform_write), but this comes a bit late in the 5.16 development cycle, so work around it in the filesystem by trimming the iov iterator to the known-good size for now. 
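The core of the work-around, reduced to the lines added in the hunk below (window_size is the number of bytes we expect to be able to fault in):

	from->count = orig_count - read;	/* bytes still to be written */
	...
	leftover = fault_in_iov_iter_readable(from, window_size);
	if (leftover != window_size)
		from->count = min(from->count, window_size - leftover);

By never retrying the write with more bytes than were just faulted in, each pass through the loop is guaranteed to make forward progress, so the loop terminates even though iomap_file_buffered_write refuses to do a short write on a partially faulted-in iterator.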
Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c486b702e00f..3e718cfc19a7 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1013,6 +1013,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; + size_t orig_count = iov_iter_count(from); size_t read = 0; ssize_t ret; @@ -1057,6 +1058,7 @@ retry_under_glock: if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); + from->count = orig_count - read; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -1064,6 +1066,7 @@ retry_under_glock: leftover = fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); if (leftover != window_size) { + from->count = min(from->count, window_size - leftover); if (!gfs2_holder_queued(gh)) { if (read) goto out_uninit; From 4d1cd1443db3d5605ebcde8672869b1944ade92d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 8 Nov 2021 07:23:44 +0100 Subject: [PATCH 009/336] powercap: DTPM: Fix suspend failure and kernel warning When the ENERGY_MODEL and DTPM_CPU are enabled but actually without any energy model, at cpu hotplug time, the dead cpuhp callback fails leading to the warning. Actually, the check could be simplified and we only do an action if the dtpm cpu is enabled, otherwise we bail out without error. Fixes: 7a89d7eacf8e ("powercap/drivers/dtpm: Simplify the dtpm table") Reported-by: Kenneth R. Crudup Tested-by: Kenneth R. Crudup Reported-by: kernel test robot Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/powercap/dtpm_cpu.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 44faa3a74db6..b740866b228d 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -166,16 +166,13 @@ static struct dtpm_ops dtpm_ops = { static int cpuhp_dtpm_cpu_offline(unsigned int cpu) { - struct em_perf_domain *pd; struct dtpm_cpu *dtpm_cpu; - pd = em_cpu_get(cpu); - if (!pd) - return -EINVAL; - dtpm_cpu = per_cpu(dtpm_per_cpu, cpu); + if (dtpm_cpu) + dtpm_update_power(&dtpm_cpu->dtpm); - return dtpm_update_power(&dtpm_cpu->dtpm); + return 0; } static int cpuhp_dtpm_cpu_online(unsigned int cpu) From d704aa0d44ade12660d4d7220b2a8d785b7b4247 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 2 Nov 2021 18:01:43 +0000 Subject: [PATCH 010/336] Documentation: power: Add description about new callback for EM registration The Energy Model (EM) registration for CPUs should now be done using a dedicated callback added recently into CPUFreq framework and drivers. Commit c17495b01b72 ("cpufreq: Add callback to register with energy model") The callback guaranties that the EM registration is called at the right time during driver setup. To avoid mistakes update the documentation to align with the existing code implementation. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- Documentation/power/energy-model.rst | 31 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 8a2788afe89b..7af0e1760962 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -138,6 +138,10 @@ or in Section 2.4 3. 
Example driver ----------------- +The CPUFreq framework supports dedicated callback for registering +the EM for a given CPU(s) 'policy' object: cpufreq_driver::register_em(). +That callback has to be implemented properly for a given driver, +because the framework would call it at the right time during setup. This section provides a simple example of a CPUFreq driver registering a performance domain in the Energy Model framework using the (fake) 'foo' protocol. The driver implements an est_power() function to be provided to the @@ -167,25 +171,22 @@ EM framework:: 20 return 0; 21 } 22 - 23 static int foo_cpufreq_init(struct cpufreq_policy *policy) + 23 static void foo_cpufreq_register_em(struct cpufreq_policy *policy) 24 { 25 struct em_data_callback em_cb = EM_DATA_CB(est_power); 26 struct device *cpu_dev; - 27 int nr_opp, ret; + 27 int nr_opp; 28 29 cpu_dev = get_cpu_device(cpumask_first(policy->cpus)); 30 - 31 /* Do the actual CPUFreq init work ... */ - 32 ret = do_foo_cpufreq_init(policy); - 33 if (ret) - 34 return ret; - 35 - 36 /* Find the number of OPPs for this policy */ - 37 nr_opp = foo_get_nr_opp(policy); + 31 /* Find the number of OPPs for this policy */ + 32 nr_opp = foo_get_nr_opp(policy); + 33 + 34 /* And register the new performance domain */ + 35 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus, + 36 true); + 37 } 38 - 39 /* And register the new performance domain */ - 40 em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus, - 41 true); - 42 - 43 return 0; - 44 } + 39 static struct cpufreq_driver foo_cpufreq_driver = { + 40 .register_em = foo_cpufreq_register_em, + 41 }; From 08374410a5ea3ff2fa9a87edd7d7ab15375b1c31 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 2 Nov 2021 18:01:44 +0000 Subject: [PATCH 011/336] Documentation: power: Describe 'advanced' and 'simple' EM models The Energy Model (EM) can be registered in two ways: 1) Using a helper function, which under the hood relies on OPP framework and DT entry in CPU node: 'dynamic-power-coefficient'. This is a 'simple' EM because it's tied to the math formula: Power = dynamic-power-coefficient * V^2 * f 2) Using em_dev_register_perf_domain() API function with a driver custom callback which provides power for each performance state. This is 'advanced' EM, since it can better reflect real power measurements for each performance state. It's not limited to any math formula and can better reflect real physics of the device. Add description of these two methods to the documentation, so developers could choose the suitable registration method (option). Signed-off-by: Lukasz Luba [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/power/energy-model.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst index 7af0e1760962..5ac62a7b4b7c 100644 --- a/Documentation/power/energy-model.rst +++ b/Documentation/power/energy-model.rst @@ -84,6 +84,16 @@ CONFIG_ENERGY_MODEL must be enabled to use the EM framework. 2.2 Registration of performance domains ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Registration of 'advanced' EM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 'advanced' EM gets it's name due to the fact that the driver is allowed +to provide more precised power model. It's not limited to some implemented math +formula in the framework (like it's in 'simple' EM case). It can better reflect +the real power measurements performed for each performance state. 
Thus, this +registration method should be preferred in case considering EM static power +(leakage) is important. + Drivers are expected to register performance domains into the EM framework by calling the following API:: @@ -103,6 +113,18 @@ to: return warning/error, stop working or panic. See Section 3. for an example of driver implementing this callback, or Section 2.4 for further documentation on this API +Registration of 'simple' EM +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 'simple' EM is registered using the framework helper function +cpufreq_register_em_with_opp(). It implements a power model which is tight to +math formula:: + + Power = C * V^2 * f + +The EM which is registered using this method might not reflect correctly the +physics of a real device, e.g. when static power (leakage) is important. + 2.3 Accessing performance domains ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 3cc1ae1fa70ab369e4645e38ce335a19438093ad Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Nov 2021 16:36:04 +0100 Subject: [PATCH 012/336] drm/nouveau: hdmigv100.c: fix corrupted HDMI Vendor InfoFrame gv100_hdmi_ctrl() writes vendor_infoframe.subpack0_high to 0x6f0110, and then overwrites it with 0. Just drop the overwrite with 0, that's clearly a mistake. Because of this issue the HDMI VIC is 0 instead of 1 in the HDMI Vendor InfoFrame when transmitting 4kp30. Signed-off-by: Hans Verkuil Fixes: 290ffeafcc1a ("drm/nouveau/disp/gv100: initial support") Reviewed-by: Ben Skeggs Signed-off-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/3d3bd0f7-c150-2479-9350-35d394ee772d@xs4all.nl --- drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c index 6e3c450eaace..3ff49344abc7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/hdmigv100.c @@ -62,7 +62,6 @@ gv100_hdmi_ctrl(struct nvkm_ior *ior, int head, bool enable, u8 max_ac_packet, nvkm_wr32(device, 0x6f0108 + hdmi, vendor_infoframe.header); nvkm_wr32(device, 0x6f010c + hdmi, vendor_infoframe.subpack0_low); nvkm_wr32(device, 0x6f0110 + hdmi, vendor_infoframe.subpack0_high); - nvkm_wr32(device, 0x6f0110 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f0114 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f0118 + hdmi, 0x00000000); nvkm_wr32(device, 0x6f011c + hdmi, 0x00000000); From 199d983bc01513173dd9cc486dbddf4d0e414d42 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 11 Nov 2021 08:57:07 +0100 Subject: [PATCH 013/336] xsk: Fix crash on double free in buffer pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a crash in the buffer pool allocator when a buffer is double freed. It is possible to trigger this behavior not only from a faulty driver, but also from user space like this: Create a zero-copy AF_XDP socket. Load an XDP program that will issue XDP_DROP for all packets. Put the same umem buffer into the fill ring multiple times, then bind the socket and send some traffic. This will crash the kernel as the XDP_DROP action triggers one call to xsk_buff_free()/xp_free() for every packet dropped. Each call will add the corresponding buffer entry to the free_list and increase the free_list_cnt. Some entries will have been added multiple times due to the same buffer being freed. 
The buffer allocation code will then traverse this broken list and since the same buffer is in the list multiple times, it will try to delete the same buffer twice from the list leading to a crash. The fix for this is just to test that the buffer has not been added before in xp_free(). If it has been, just return from the function and do not put it in the free_list a second time. Note that this bug was not present in the code before the commit referenced in the Fixes tag. That code used one list entry per allocated buffer, so multiple frees did not have any side effects. But the commit below optimized the usage of the pool and only uses a single entry per buffer in the umem, meaning that multiple allocations/frees of the same buffer will also only use one entry, thus leading to the problem. Fixes: 47e4075df300 ("xsk: Batched buffer allocation for the pool") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20211111075707.21922-1-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 90c4e1e819d3..bc4ad48ea4f0 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -500,7 +500,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del(&xskb->free_list_node); + list_del_init(&xskb->free_list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; @@ -568,7 +568,7 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3 i = nb_entries; while (i--) { xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del(&xskb->free_list_node); + list_del_init(&xskb->free_list_node); *xdp = &xskb->xdp; xdp++; @@ -615,6 +615,9 @@ EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { + if (!list_empty(&xskb->free_list_node)) + return; + xskb->pool->free_list_cnt++; list_add(&xskb->free_list_node, &xskb->pool->free_list); } From 34d11a440c6167133201b7374065b59f259730d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 10 Nov 2021 09:25:56 -0800 Subject: [PATCH 014/336] bpf: Fix inner map state pruning regression. Introduction of map_uid made two lookups from outer map to be distinct. That distinction is only necessary when inner map has an embedded timer. Otherwise it will make the verifier state pruning to be conservative which will cause complex programs to hit 1M insn_processed limit. Tighten map_uid logic to apply to inner maps with timers only. Fixes: 3e8ce29850f1 ("bpf: Prevent pointer mismatch in bpf_timer_init.") Reported-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com Link: https://lore.kernel.org/bpf/20211110172556.20754-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 890b3ec375a3..aab7482ed1c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,7 +1151,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. 
*/ - reg->map_uid = reg->id; + if (map_value_has_timer(map->inner_map_meta)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || From 98d948eb833104a094517401ed8be26ba3ce9935 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Mon, 8 Nov 2021 14:08:54 -0600 Subject: [PATCH 015/336] spi: cadence-quadspi: fix write completion support Some versions of the Cadence QSPI controller does not have the write completion register implemented(CQSPI_REG_WR_COMPLETION_CTRL). On the Intel SoCFPGA platform the CQSPI_REG_WR_COMPLETION_CTRL register is not configured. Add a quirk to not write to the CQSPI_REG_WR_COMPLETION_CTRL register. Fixes: 9cb2ff111712 ("spi: cadence-quadspi: Disable Auto-HW polling) Signed-off-by: Dinh Nguyen Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/r/20211108200854.3616121-1-dinguyen@kernel.org Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 8b3d268ac63c..b808c94641fa 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -37,6 +37,7 @@ #define CQSPI_NEEDS_WR_DELAY BIT(0) #define CQSPI_DISABLE_DAC_MODE BIT(1) #define CQSPI_SUPPORT_EXTERNAL_DMA BIT(2) +#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3) /* Capabilities */ #define CQSPI_SUPPORTS_OCTAL BIT(0) @@ -86,6 +87,7 @@ struct cqspi_st { struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT]; bool use_dma_read; u32 pd_dev_id; + bool wr_completion; }; struct cqspi_driver_platdata { @@ -996,9 +998,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata, * polling on the controller's side. spinand and spi-nor will take * care of polling the status register. 
*/ - reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL); - reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL; - writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL); + if (cqspi->wr_completion) { + reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL); + reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL; + writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL); + } reg = readl(reg_base + CQSPI_REG_SIZE); reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK; @@ -1736,6 +1740,10 @@ static int cqspi_probe(struct platform_device *pdev) cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk); master->max_speed_hz = cqspi->master_ref_clk_hz; + + /* write completion is supported by default */ + cqspi->wr_completion = true; + ddata = of_device_get_match_data(dev); if (ddata) { if (ddata->quirks & CQSPI_NEEDS_WR_DELAY) @@ -1747,6 +1755,8 @@ static int cqspi_probe(struct platform_device *pdev) cqspi->use_direct_mode = true; if (ddata->quirks & CQSPI_SUPPORT_EXTERNAL_DMA) cqspi->use_dma_read = true; + if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION) + cqspi->wr_completion = false; if (of_device_is_compatible(pdev->dev.of_node, "xlnx,versal-ospi-1.0")) @@ -1859,6 +1869,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = { .quirks = CQSPI_DISABLE_DAC_MODE, }; +static const struct cqspi_driver_platdata socfpga_qspi = { + .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION, +}; + static const struct cqspi_driver_platdata versal_ospi = { .hwcaps_mask = CQSPI_SUPPORTS_OCTAL, .quirks = CQSPI_DISABLE_DAC_MODE | CQSPI_SUPPORT_EXTERNAL_DMA, @@ -1887,6 +1901,10 @@ static const struct of_device_id cqspi_dt_ids[] = { .compatible = "xlnx,versal-ospi-1.0", .data = (void *)&versal_ospi, }, + { + .compatible = "intel,socfpga-qspi", + .data = (void *)&socfpga_qspi, + }, { /* end of table */ } }; From 12f62a857c83b2efcbf8d9961aacd352bf81ad3d Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 8 Nov 2021 15:55:23 +0100 Subject: [PATCH 016/336] spi: lpspi: Silence error message upon deferred probe Do not print error messages with error code -517. Silences the following errors upon on imx8qm: fsl_lpspi 5a000000.spi: spi_register_controller error: -517 fsl_lpspi 5a010000.spi: spi_register_controller error: -517 fsl_lpspi 5a020000.spi: spi_register_controller error: -517 Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20211108145523.1797609-1-alexander.stein@ew.tq-group.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 5d98611dd999..c72e501c270f 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -912,7 +912,7 @@ static int fsl_lpspi_probe(struct platform_device *pdev) ret = devm_spi_register_controller(&pdev->dev, controller); if (ret < 0) { - dev_err(&pdev->dev, "spi_register_controller error.\n"); + dev_err_probe(&pdev->dev, ret, "spi_register_controller error: %i\n", ret); goto out_pm_get; } From 6532582c353f4c83e3ccdd7255020ab852b90b0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Nov 2021 10:39:35 +0300 Subject: [PATCH 017/336] spi: spi-geni-qcom: fix error handling in spi_geni_grab_gpi_chan() This code has several issues: 1) It passes IS_ERR() to dev_err_probe() instead of PTR_ERR(). 2) It always prints an error message, even when it succeeds. 3) The "if (ret < 0) {" conditions are never true. 4) If requesting "mas->tx" fails then it sets "mas->rx" to NULL but the intention was to set "mas->tx" to NULL. 
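Points 1) and 2) both follow from dev_err_probe()'s contract: it returns the error code passed as its second argument and prints the message for any code other than -EPROBE_DEFER. Passing it IS_ERR(), which evaluates to 0 or 1, therefore prints even on success and never yields a negative value for the "ret < 0" test. The intended pattern, as used in the fix below, is:

	if (IS_ERR(mas->tx)) {
		ret = dev_err_probe(mas->dev, PTR_ERR(mas->tx),
				    "Failed to get tx DMA ch\n");
		goto err_tx;
	}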
Fixes: b59c122484ec ("spi: spi-geni-qcom: Add support for GPI dma") Signed-off-by: Dan Carpenter Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20211110073935.GA5176@kili Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index 27a446faf143..e2affaee4e76 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -491,22 +491,26 @@ static int spi_geni_grab_gpi_chan(struct spi_geni_master *mas) int ret; mas->tx = dma_request_chan(mas->dev, "tx"); - ret = dev_err_probe(mas->dev, IS_ERR(mas->tx), "Failed to get tx DMA ch\n"); - if (ret < 0) + if (IS_ERR(mas->tx)) { + ret = dev_err_probe(mas->dev, PTR_ERR(mas->tx), + "Failed to get tx DMA ch\n"); goto err_tx; + } mas->rx = dma_request_chan(mas->dev, "rx"); - ret = dev_err_probe(mas->dev, IS_ERR(mas->rx), "Failed to get rx DMA ch\n"); - if (ret < 0) + if (IS_ERR(mas->rx)) { + ret = dev_err_probe(mas->dev, PTR_ERR(mas->rx), + "Failed to get rx DMA ch\n"); goto err_rx; + } return 0; err_rx: - dma_release_channel(mas->tx); - mas->tx = NULL; -err_tx: mas->rx = NULL; + dma_release_channel(mas->tx); +err_tx: + mas->tx = NULL; return ret; } From 6c53b45c71b4920b5e62f0ea8079a1da382b9434 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 11 Nov 2021 09:37:13 +0100 Subject: [PATCH 018/336] spi: fix use-after-free of the add_lock mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6098475d4cb4 ("spi: Fix deadlock when adding SPI controllers on SPI buses") introduced a per-controller mutex. But mutex_unlock() of said lock is called after the controller is already freed: spi_unregister_controller(ctlr) -> put_device(&ctlr->dev) -> spi_controller_release(dev) -> mutex_unlock(&ctrl->add_lock) Move the put_device() after the mutex_unlock(). Fixes: 6098475d4cb4 ("spi: Fix deadlock when adding SPI controllers on SPI buses") Signed-off-by: Michael Walle Reviewed-by: Uwe Kleine-König Reviewed-by: Lukas Wunner Cc: stable@vger.kernel.org # v5.15 Link: https://lore.kernel.org/r/20211111083713.3335171-1-michael@walle.cc Signed-off-by: Mark Brown --- drivers/spi/spi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 72826bdab270..a2988aed984c 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -3058,12 +3058,6 @@ void spi_unregister_controller(struct spi_controller *ctlr) device_del(&ctlr->dev); - /* Release the last reference on the controller if its driver - * has not yet been converted to devm_spi_alloc_master/slave(). - */ - if (!ctlr->devm_allocated) - put_device(&ctlr->dev); - /* free bus id */ mutex_lock(&board_lock); if (found == ctlr) @@ -3072,6 +3066,12 @@ void spi_unregister_controller(struct spi_controller *ctlr) if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) mutex_unlock(&ctlr->add_lock); + + /* Release the last reference on the controller if its driver + * has not yet been converted to devm_spi_alloc_master/slave(). 
+ */ + if (!ctlr->devm_allocated) + put_device(&ctlr->dev); } EXPORT_SYMBOL_GPL(spi_unregister_controller); From 6af2e1237412ca735e3f18f2044902b0c514f2db Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 11 Nov 2021 16:14:52 +0000 Subject: [PATCH 019/336] selftests/bpf: Check map in map pruning Ensure that two registers with a map_value loaded from a nested map are considered equivalent for the purpose of state pruning and don't cause the verifier to revisit a pruning point. This uses a rather crude match on the number of insns visited by the verifier, which might change in the future. I've therefore tried to keep the code as "unpruneable" as possible by having the code paths only converge on the second to last instruction. Should you require to adjust the test in the future, reducing the number of processed instructions should always be safe. Increasing them could cause another regression, so proceed with caution. Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com Link: https://lore.kernel.org/bpf/20211111161452.86864-1-lmb@cloudflare.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/verifier/map_in_map.c | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/map_in_map.c b/tools/testing/selftests/bpf/verifier/map_in_map.c index 2798927ee9ff..128a348b762d 100644 --- a/tools/testing/selftests/bpf/verifier/map_in_map.c +++ b/tools/testing/selftests/bpf/verifier/map_in_map.c @@ -18,6 +18,40 @@ .fixup_map_in_map = { 3 }, .result = ACCEPT, }, +{ + "map in map state pruning", + .insns = { + BPF_ST_MEM(0, BPF_REG_10, -4, 0), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -4), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 11), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_6), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_in_map = { 4, 14 }, + .flags = BPF_F_TEST_STATE_FREQ, + .result = VERBOSE_ACCEPT, + .errstr = "processed 25 insns", + .prog_type = BPF_PROG_TYPE_XDP, +}, { "invalid inner map pointer", .insns = { From dc14ca4644f48b1cfa93631e35c28bdc011ad109 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 11 Nov 2021 22:57:03 +0100 Subject: [PATCH 020/336] samples/bpf: Fix summary per-sec stats in xdp_sample_user sample_summary_print() uses accumulated period to calculate and display per-sec averages. This period gets incremented by sampling interval each time a new sample is formed, and thus equals to the number of samples collected multiplied by this interval. 
However, the totals are being calculated differently, they receive current sample statistics already divided by the interval gotten as a difference between sample timestamps for better precision -- in other words, they are being incremented by the per-sec values each sample. This leads to the excessive division of summary per-secs when interval != 1 sec. It is obvious pps couldn't become two times lower just from picking a different sampling interval value: $ samples/bpf/xdp_redirect_cpu -p xdp_prognum_n1_inverse_qnum -c all -s -d 6 -i 1 < snip > Packets received : 2,197,230,321 Average packets/s : 22,887,816 Packets redirected : 2,197,230,472 Average redir/s : 22,887,817 $ samples/bpf/xdp_redirect_cpu -p xdp_prognum_n1_inverse_qnum -c all -s -d 6 -i 2 < snip > Packets received : 159,566,498 Average packets/s : 11,397,607 Packets redirected : 159,566,995 Average redir/s : 11,397,642 This can be easily fixed by treating the divisor not as a period, but rather as a total number of samples, and thus incrementing it by 1 instead of interval. As a nice side effect, we can now remove so-named argument from a couple of functions. Let us also create an "alias" for sample_output::rx_cnt::pps named 'num' using a union since this field is used to store this number (period previously) as well, and the resulting counter-intuitive code might've been a reason for this bug. Fixes: 156f886cf697 ("samples: bpf: Add basic infrastructure for XDP samples") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Reviewed-by: Jesse Brandeburg Reviewed-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20211111215703.690-1-alexandr.lobakin@intel.com Signed-off-by: Alexei Starovoitov --- samples/bpf/xdp_sample_user.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/samples/bpf/xdp_sample_user.c b/samples/bpf/xdp_sample_user.c index b32d82178199..8740838e7767 100644 --- a/samples/bpf/xdp_sample_user.c +++ b/samples/bpf/xdp_sample_user.c @@ -120,7 +120,10 @@ struct sample_output { __u64 xmit; } totals; struct { - __u64 pps; + union { + __u64 pps; + __u64 num; + }; __u64 drop; __u64 err; } rx_cnt; @@ -1322,7 +1325,7 @@ int sample_install_xdp(struct bpf_program *xdp_prog, int ifindex, bool generic, static void sample_summary_print(void) { - double period = sample_out.rx_cnt.pps; + double num = sample_out.rx_cnt.num; if (sample_out.totals.rx) { double pkts = sample_out.totals.rx; @@ -1330,7 +1333,7 @@ static void sample_summary_print(void) print_always(" Packets received : %'-10llu\n", sample_out.totals.rx); print_always(" Average packets/s : %'-10.0f\n", - sample_round(pkts / period)); + sample_round(pkts / num)); } if (sample_out.totals.redir) { double pkts = sample_out.totals.redir; @@ -1338,7 +1341,7 @@ static void sample_summary_print(void) print_always(" Packets redirected : %'-10llu\n", sample_out.totals.redir); print_always(" Average redir/s : %'-10.0f\n", - sample_round(pkts / period)); + sample_round(pkts / num)); } if (sample_out.totals.drop) print_always(" Rx dropped : %'-10llu\n", @@ -1355,7 +1358,7 @@ static void sample_summary_print(void) print_always(" Packets transmitted : %'-10llu\n", sample_out.totals.xmit); print_always(" Average transmit/s : %'-10.0f\n", - sample_round(pkts / period)); + sample_round(pkts / num)); } } @@ -1422,7 +1425,7 @@ static int sample_stats_collect(struct stats_record *rec) return 0; } -static void sample_summary_update(struct sample_output *out, int interval) +static void sample_summary_update(struct 
sample_output *out) { sample_out.totals.rx += out->totals.rx; sample_out.totals.redir += out->totals.redir; @@ -1430,12 +1433,11 @@ static void sample_summary_update(struct sample_output *out, int interval) sample_out.totals.drop_xmit += out->totals.drop_xmit; sample_out.totals.err += out->totals.err; sample_out.totals.xmit += out->totals.xmit; - sample_out.rx_cnt.pps += interval; + sample_out.rx_cnt.num++; } static void sample_stats_print(int mask, struct stats_record *cur, - struct stats_record *prev, char *prog_name, - int interval) + struct stats_record *prev, char *prog_name) { struct sample_output out = {}; @@ -1452,7 +1454,7 @@ static void sample_stats_print(int mask, struct stats_record *cur, else if (mask & SAMPLE_DEVMAP_XMIT_CNT_MULTI) stats_get_devmap_xmit_multi(cur, prev, 0, &out, mask & SAMPLE_DEVMAP_XMIT_CNT); - sample_summary_update(&out, interval); + sample_summary_update(&out); stats_print(prog_name, mask, cur, prev, &out); } @@ -1495,7 +1497,7 @@ static void swap(struct stats_record **a, struct stats_record **b) } static int sample_timer_cb(int timerfd, struct stats_record **rec, - struct stats_record **prev, int interval) + struct stats_record **prev) { char line[64] = "Summary"; int ret; @@ -1524,7 +1526,7 @@ static int sample_timer_cb(int timerfd, struct stats_record **rec, snprintf(line, sizeof(line), "%s->%s", f ?: "?", t ?: "?"); } - sample_stats_print(sample_mask, *rec, *prev, line, interval); + sample_stats_print(sample_mask, *rec, *prev, line); return 0; } @@ -1579,7 +1581,7 @@ int sample_run(int interval, void (*post_cb)(void *), void *ctx) if (pfd[0].revents & POLLIN) ret = sample_signal_cb(); else if (pfd[1].revents & POLLIN) - ret = sample_timer_cb(timerfd, &rec, &prev, interval); + ret = sample_timer_cb(timerfd, &rec, &prev); if (ret) break; From e4ac80ef8198636a23866a59575917550328886f Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 12 Nov 2021 15:51:30 +0000 Subject: [PATCH 021/336] tools/runqslower: Fix cross-build Commit be79505caf3f ("tools/runqslower: Install libbpf headers when building") uses the target libbpf to build the host bpftool, which doesn't work when cross-building: make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -C tools/bpf/runqslower O=/tmp/runqslower ... LINK /tmp/runqslower/bpftool/bpftool /usr/bin/ld: /tmp/runqslower/libbpf/libbpf.a(libbpf-in.o): Relocations in generic ELF (EM: 183) /usr/bin/ld: /tmp/runqslower/libbpf/libbpf.a: error adding symbols: file in wrong format collect2: error: ld returned 1 exit status When cross-building, the target architecture differs from the host. The bpftool used for building runqslower is executed on the host, and thus must use a different libbpf than that used for runqslower itself. Remove the LIBBPF_OUTPUT and LIBBPF_DESTDIR parameters, so the bpftool build makes its own library if necessary. In the selftests, pass the host bpftool, already a prerequisite for the runqslower recipe, as BPFTOOL_OUTPUT. The runqslower Makefile will use the bpftool that's already built for selftests instead of making a new one. 
Fixes: be79505caf3f ("tools/runqslower: Install libbpf headers when building") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20211112155128.565680-1-jean-philippe@linaro.org Signed-off-by: Alexei Starovoitov --- tools/bpf/runqslower/Makefile | 3 +-- tools/testing/selftests/bpf/Makefile | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/bpf/runqslower/Makefile b/tools/bpf/runqslower/Makefile index bbd1150578f7..8791d0e2762b 100644 --- a/tools/bpf/runqslower/Makefile +++ b/tools/bpf/runqslower/Makefile @@ -88,5 +88,4 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(BPFOBJ_OU $(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C ../bpftool OUTPUT=$(BPFTOOL_OUTPUT) \ - LIBBPF_OUTPUT=$(BPFOBJ_OUTPUT) \ - LIBBPF_DESTDIR=$(BPF_DESTDIR) CC=$(HOSTCC) LD=$(HOSTLD) + CC=$(HOSTCC) LD=$(HOSTLD) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 54b0a41a3775..62fafbeb4672 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -187,7 +187,7 @@ DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool $(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL) $(RUNQSLOWER_OUTPUT) $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \ OUTPUT=$(RUNQSLOWER_OUTPUT) VMLINUX_BTF=$(VMLINUX_BTF) \ - BPFTOOL_OUTPUT=$(BUILD_DIR)/bpftool/ \ + BPFTOOL_OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ BPFOBJ_OUTPUT=$(BUILD_DIR)/libbpf \ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \ cp $(RUNQSLOWER_OUTPUT)runqslower $@ From 2453afe3845523d9dfe89dbfb3d71abfa095e260 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 12 Nov 2021 07:33:01 +0530 Subject: [PATCH 022/336] samples/bpf: Fix incorrect use of strlen in xdp_redirect_cpu Commit b599015f044d ("samples/bpf: Fix application of sizeof to pointer") tried to fix a bug where sizeof was incorrectly applied to a pointer instead of the array string was being copied to, to find the destination buffer size, but ended up using strlen, which is still incorrect. However, on closer look ifname_buf has no other use, hence directly use optarg. 
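Spelled out, the sequence of mistakes in the -d/--dev handler (taken from the lines removed below) was:

	char ifname_buf[IF_NAMESIZE];
	...
	/* sizeof(ifname_buf) was meant; strlen() on the uninitialized
	 * destination buffer yields a garbage copy length */
	safe_strncpy(ifname_buf, optarg, strlen(ifname_buf));
	ifindex = if_nametoindex(ifname_buf);

Since optarg has already been length-checked against IF_NAMESIZE at this point, the temporary buffer adds nothing and if_nametoindex(optarg) can be called directly.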
Fixes: b599015f044d ("samples/bpf: Fix application of sizeof to pointer") Fixes: e531a220cc59 ("samples: bpf: Convert xdp_redirect_cpu to XDP samples helper") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Lobakin Tested-by: Alexander Lobakin Link: https://lore.kernel.org/bpf/20211112020301.528357-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- samples/bpf/xdp_redirect_cpu_user.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index d84e6949007c..a81704d3317b 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -309,7 +309,6 @@ int main(int argc, char **argv) const char *mprog_filename = NULL, *mprog_name = NULL; struct xdp_redirect_cpu *skel; struct bpf_map_info info = {}; - char ifname_buf[IF_NAMESIZE]; struct bpf_cpumap_val value; __u32 infosz = sizeof(info); int ret = EXIT_FAIL_OPTION; @@ -390,10 +389,10 @@ int main(int argc, char **argv) case 'd': if (strlen(optarg) >= IF_NAMESIZE) { fprintf(stderr, "-d/--dev name too long\n"); + usage(argv, long_options, __doc__, mask, true, skel->obj); goto end_cpu; } - safe_strncpy(ifname_buf, optarg, strlen(ifname_buf)); - ifindex = if_nametoindex(ifname_buf); + ifindex = if_nametoindex(optarg); if (!ifindex) ifindex = strtoul(optarg, NULL, 0); if (!ifindex) { From ba05fd36b8512d6aeefe9c2c5b6a25b726c4bfff Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 13 Nov 2021 04:50:22 +0530 Subject: [PATCH 023/336] libbpf: Perform map fd cleanup for gen_loader in case of error Alexei reported a fd leak issue in gen loader (when invoked from bpftool) [0]. When adding ksym support, map fd allocation was moved from stack to loader map, however I missed closing these fds (relevant when cleanup label is jumped to on error). For the success case, the allocated fd is returned in loader ctx, hence this problem is not noticed. Make three changes, first MAX_USED_MAPS in MAX_FD_ARRAY_SZ instead of MAX_USED_PROGS, the braino was not a problem until now for this case as we didn't try to close map fds (otherwise use of it would have tried closing 32 additional fds in ksym btf fd range). Then, do a cleanup for all nr_maps fds in cleanup label code, so that in case of error all temporary map fds from bpf_gen__map_create are closed. Then, adjust the cleanup label to only generate code for the required number of program and map fds. To trim code for remaining program fds, lay out prog_fd array in stack in the end, so that we can directly skip the remaining instances. Still stack size remains same, since changing that would require changes in a lot of places (including adjustment of stack_off macro), so nr_progs_sz variable is only used to track required number of iterations (and jump over cleanup size calculated from that), stack offset calculation remains unaffected. The difference for test_ksyms_module.o is as follows: libbpf: //prog cleanup iterations: before = 34, after = 5 libbpf: //maps cleanup iterations: before = 64, after = 2 Also, move allocation of gen->fd_array offset to bpf_gen__init. Since offset can now be 0, and we already continue even if add_data returns 0 in case of failure, we do not need to distinguish between 0 offset and failure case 0, as we rely on bpf_gen__finish to check errors. We can also skip check for gen->fd_array in add_*_fd functions, since bpf_gen__init will take care of it. 
[0]: https://lore.kernel.org/bpf/CAADnVQJ6jSitKSNKyxOrUzwY2qDRX0sPkJ=VLGHuCLVJ=qOt9g@mail.gmail.com Fixes: 18f4fccbf314 ("libbpf: Update gen_loader to emit BTF_KIND_FUNC relocations") Reported-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211112232022.899074-1-memxor@gmail.com --- tools/lib/bpf/bpf_gen_internal.h | 4 +-- tools/lib/bpf/gen_loader.c | 47 ++++++++++++++++++++------------ tools/lib/bpf/libbpf.c | 4 +-- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h index d26e5472fe50..6f3df004479b 100644 --- a/tools/lib/bpf/bpf_gen_internal.h +++ b/tools/lib/bpf/bpf_gen_internal.h @@ -45,8 +45,8 @@ struct bpf_gen { int nr_fd_array; }; -void bpf_gen__init(struct bpf_gen *gen, int log_level); -int bpf_gen__finish(struct bpf_gen *gen); +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps); +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps); void bpf_gen__free(struct bpf_gen *gen); void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size); void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_params *map_attr, int map_idx); diff --git a/tools/lib/bpf/gen_loader.c b/tools/lib/bpf/gen_loader.c index 502dea53a742..9934851ccde7 100644 --- a/tools/lib/bpf/gen_loader.c +++ b/tools/lib/bpf/gen_loader.c @@ -18,7 +18,7 @@ #define MAX_USED_MAPS 64 #define MAX_USED_PROGS 32 #define MAX_KFUNC_DESCS 256 -#define MAX_FD_ARRAY_SZ (MAX_USED_PROGS + MAX_KFUNC_DESCS) +#define MAX_FD_ARRAY_SZ (MAX_USED_MAPS + MAX_KFUNC_DESCS) /* The following structure describes the stack layout of the loader program. * In addition R6 contains the pointer to context. 
@@ -33,8 +33,8 @@ */ struct loader_stack { __u32 btf_fd; - __u32 prog_fd[MAX_USED_PROGS]; __u32 inner_map_fd; + __u32 prog_fd[MAX_USED_PROGS]; }; #define stack_off(field) \ @@ -42,6 +42,11 @@ struct loader_stack { #define attr_field(attr, field) (attr + offsetof(union bpf_attr, field)) +static int blob_fd_array_off(struct bpf_gen *gen, int index) +{ + return gen->fd_array + index * sizeof(int); +} + static int realloc_insn_buf(struct bpf_gen *gen, __u32 size) { size_t off = gen->insn_cur - gen->insn_start; @@ -102,11 +107,15 @@ static void emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn in emit(gen, insn2); } -void bpf_gen__init(struct bpf_gen *gen, int log_level) +static int add_data(struct bpf_gen *gen, const void *data, __u32 size); +static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off); + +void bpf_gen__init(struct bpf_gen *gen, int log_level, int nr_progs, int nr_maps) { - size_t stack_sz = sizeof(struct loader_stack); + size_t stack_sz = sizeof(struct loader_stack), nr_progs_sz; int i; + gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); gen->log_level = log_level; /* save ctx pointer into R6 */ emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1)); @@ -118,19 +127,27 @@ void bpf_gen__init(struct bpf_gen *gen, int log_level) emit(gen, BPF_MOV64_IMM(BPF_REG_3, 0)); emit(gen, BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel)); + /* amount of stack actually used, only used to calculate iterations, not stack offset */ + nr_progs_sz = offsetof(struct loader_stack, prog_fd[nr_progs]); /* jump over cleanup code */ emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, - /* size of cleanup code below */ - (stack_sz / 4) * 3 + 2)); + /* size of cleanup code below (including map fd cleanup) */ + (nr_progs_sz / 4) * 3 + 2 + + /* 6 insns for emit_sys_close_blob, + * 6 insns for debug_regs in emit_sys_close_blob + */ + nr_maps * (6 + (gen->log_level ? 6 : 0)))); /* remember the label where all error branches will jump to */ gen->cleanup_label = gen->insn_cur - gen->insn_start; /* emit cleanup code: close all temp FDs */ - for (i = 0; i < stack_sz; i += 4) { + for (i = 0; i < nr_progs_sz; i += 4) { emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, -stack_sz + i)); emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 1)); emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close)); } + for (i = 0; i < nr_maps; i++) + emit_sys_close_blob(gen, blob_fd_array_off(gen, i)); /* R7 contains the error code from sys_bpf. Copy it into R0 and exit. 
*/ emit(gen, BPF_MOV64_REG(BPF_REG_0, BPF_REG_7)); emit(gen, BPF_EXIT_INSN()); @@ -160,8 +177,6 @@ static int add_data(struct bpf_gen *gen, const void *data, __u32 size) */ static int add_map_fd(struct bpf_gen *gen) { - if (!gen->fd_array) - gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); if (gen->nr_maps == MAX_USED_MAPS) { pr_warn("Total maps exceeds %d\n", MAX_USED_MAPS); gen->error = -E2BIG; @@ -174,8 +189,6 @@ static int add_kfunc_btf_fd(struct bpf_gen *gen) { int cur; - if (!gen->fd_array) - gen->fd_array = add_data(gen, NULL, MAX_FD_ARRAY_SZ * sizeof(int)); if (gen->nr_fd_array == MAX_KFUNC_DESCS) { cur = add_data(gen, NULL, sizeof(int)); return (cur - gen->fd_array) / sizeof(int); @@ -183,11 +196,6 @@ static int add_kfunc_btf_fd(struct bpf_gen *gen) return MAX_USED_MAPS + gen->nr_fd_array++; } -static int blob_fd_array_off(struct bpf_gen *gen, int index) -{ - return gen->fd_array + index * sizeof(int); -} - static int insn_bytes_to_bpf_size(__u32 sz) { switch (sz) { @@ -359,10 +367,15 @@ static void emit_sys_close_blob(struct bpf_gen *gen, int blob_off) __emit_sys_close(gen); } -int bpf_gen__finish(struct bpf_gen *gen) +int bpf_gen__finish(struct bpf_gen *gen, int nr_progs, int nr_maps) { int i; + if (nr_progs != gen->nr_progs || nr_maps != gen->nr_maps) { + pr_warn("progs/maps mismatch\n"); + gen->error = -EFAULT; + return gen->error; + } emit_sys_close_stack(gen, stack_off(btf_fd)); for (i = 0; i < gen->nr_progs; i++) move_stack2ctx(gen, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a1bea1953df6..7c74342bb668 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7258,7 +7258,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) } if (obj->gen_loader) - bpf_gen__init(obj->gen_loader, attr->log_level); + bpf_gen__init(obj->gen_loader, attr->log_level, obj->nr_programs, obj->nr_maps); err = bpf_object__probe_loading(obj); err = err ? : bpf_object__load_vmlinux_btf(obj, false); @@ -7277,7 +7277,7 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr) for (i = 0; i < obj->nr_maps; i++) obj->maps[i].fd = -1; if (!err) - err = bpf_gen__finish(obj->gen_loader); + err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } /* clean up fd_array */ From 81b1d548d00bcd028303c4f3150fa753b9b8aa71 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Thu, 11 Nov 2021 22:14:02 +0800 Subject: [PATCH 024/336] hamradio: remove needs_free_netdev to avoid UAF The former patch "defer 6pack kfree after unregister_netdev" reorders the kfree of two buffer after the unregister_netdev to prevent the race condition. It also adds free_netdev() function in sixpack_close(), which is a direct copy from the similar code in mkiss_close(). However, in sixpack driver, the flag needs_free_netdev is set to true in sp_setup(), hence the unregister_netdev() will free the netdev automatically. Therefore, as the sp is netdev_priv, use-after-free occurs. This patch removes the needs_free_netdev = true and just let the free_netdev to finish this deallocation task. 
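As an aside, the ownership rule the fix relies on can be shown with a small standalone model (this is not the sixpack driver code; every name below is made up). Once a driver sets needs_free_netdev, unregister_netdev() performs the final free, so any additional free in the close path releases the same allocation twice and leaves the private area dangling:

    #include <stdbool.h>
    #include <stdlib.h>

    /* Toy stand-ins for net_device and netdev_priv(); purely illustrative. */
    struct toy_netdev {
            bool needs_free_netdev;
            char priv[64];          /* models netdev_priv(dev), i.e. "sp" */
    };

    /* Models unregister_netdev(): frees the device itself when the driver
     * opted in via needs_free_netdev.
     */
    static void toy_unregister_netdev(struct toy_netdev *dev)
    {
            if (dev->needs_free_netdev)
                    free(dev);      /* priv lives inside this allocation */
    }

    static void toy_close(struct toy_netdev *dev)
    {
            toy_unregister_netdev(dev);
            /*
             * Calling free(dev) here as well - the equivalent of the
             * free_netdev() copied from mkiss_close() - would free the same
             * memory a second time once needs_free_netdev is set, and any
             * later access to dev->priv would be a use-after-free.
             */
    }

    int main(void)
    {
            struct toy_netdev *dev = calloc(1, sizeof(*dev));

            if (!dev)
                    return 1;
            dev->needs_free_netdev = true;  /* what sp_setup() used to do */
            toy_close(dev);                 /* exactly one owner: fine */
            return 0;
    }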
Fixes: 0b9111922b1f ("hamradio: defer 6pack kfree after unregister_netdev") Signed-off-by: Lin Ma Link: https://lore.kernel.org/r/20211111141402.7551-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/hamradio/6pack.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index bfdf89e54752..8a19a06b505d 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -306,7 +306,6 @@ static void sp_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->netdev_ops = &sp_netdev_ops; - dev->needs_free_netdev = true; dev->mtu = SIXP_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->header_ops = &ax25_header_ops; From 87530779de0440f1ca3cd727299b1a78a9e564d0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 11 Nov 2021 07:50:34 -0800 Subject: [PATCH 025/336] ptp: ptp_clockmatrix: repair non-kernel-doc comment Do not use "/**" to begin a comment that is not in kernel-doc format. Prevents this docs build warning: drivers/ptp/ptp_clockmatrix.c:1679: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Maximum absolute value for write phase offset in picoseconds Then remove the kernel-doc-like function parameter descriptions since they don't add any useful info. (suggested by Jakub) Fixes: 794c3dffacc16 ("ptp: ptp_clockmatrix: Add support for FW 5.2 (8A34005)") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Min Li Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20211111155034.29153-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clockmatrix.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index 6bc5791a7ec5..08e429a06922 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -1699,12 +1699,9 @@ static int initialize_dco_operating_mode(struct idtcm_channel *channel) /* PTP Hardware Clock interface */ -/** +/* * Maximum absolute value for write phase offset in picoseconds * - * @channel: channel - * @delta_ns: delta in nanoseconds - * * Destination signed register is 32-bit register in resolution of 50ps * * 0x7fffffff * 50 = 2147483647 * 50 = 107374182350 From 0cda7d4bac5fd29dceb13df26083333fa99d6bb4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Nov 2021 08:29:29 -0800 Subject: [PATCH 026/336] selftests: net: switch to socat in the GSO GRE test Commit a985442fdecb ("selftests: net: properly support IPv6 in GSO GRE test") is not compatible with: Ncat: Version 7.80 ( https://nmap.org/ncat ) (which is distributed with Fedora/Red Hat), tests fail with: nc: invalid option -- 'N' Let's switch to socat which is far more dependable. 
Fixes: 025efa0a82df ("selftests: add simple GSO GRE test") Fixes: a985442fdecb ("selftests: net: properly support IPv6 in GSO GRE test") Tested-by: Andrea Righi Link: https://lore.kernel.org/r/20211111162929.530470-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/gre_gso.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/gre_gso.sh b/tools/testing/selftests/net/gre_gso.sh index fdeb44d621eb..3224651db97b 100755 --- a/tools/testing/selftests/net/gre_gso.sh +++ b/tools/testing/selftests/net/gre_gso.sh @@ -118,16 +118,18 @@ gre_gst_test_checks() local addr=$2 local proto=$3 - $NS_EXEC nc $proto -kl $port >/dev/null & + [ "$proto" == 6 ] && addr="[$addr]" + + $NS_EXEC socat - tcp${proto}-listen:$port,reuseaddr,fork >/dev/null & PID=$! while ! $NS_EXEC ss -ltn | grep -q $port; do ((i++)); sleep 0.01; done - cat $TMPFILE | timeout 1 nc $proto -N $addr $port + cat $TMPFILE | timeout 1 socat -u STDIN TCP:$addr:$port log_test $? 0 "$name - copy file w/ TSO" ethtool -K veth0 tso off - cat $TMPFILE | timeout 1 nc $proto -N $addr $port + cat $TMPFILE | timeout 1 socat -u STDIN TCP:$addr:$port log_test $? 0 "$name - copy file w/ GSO" ethtool -K veth0 tso on @@ -155,8 +157,8 @@ gre6_gso_test() sleep 2 - gre_gst_test_checks GREv6/v4 172.16.2.2 - gre_gst_test_checks GREv6/v6 2001:db8:1::2 -6 + gre_gst_test_checks GREv6/v4 172.16.2.2 4 + gre_gst_test_checks GREv6/v6 2001:db8:1::2 6 cleanup } @@ -212,8 +214,8 @@ if [ ! -x "$(command -v ip)" ]; then exit $ksft_skip fi -if [ ! -x "$(command -v nc)" ]; then - echo "SKIP: Could not run test without nc tool" +if [ ! -x "$(command -v socat)" ]; then + echo "SKIP: Could not run test without socat tool" exit $ksft_skip fi From 27df68d579c67ef6c39a5047559b6a7c08c96219 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 11 Nov 2021 19:37:24 +0100 Subject: [PATCH 027/336] net/ipa: ipa_resource: Fix wrong for loop range The source group count was mistakenly assigned to both dst and src loops. Fix it to make IPA probe and work again. Fixes: 4fd704b3608a ("net: ipa: record number of groups in data") Acked-by: AngeloGioacchino Del Regno Reviewed-by: Marijn Suijten Signed-off-by: Konrad Dybcio Reviewed-by: Alex Elder Link: https://lore.kernel.org/r/20211111183724.593478-1-konrad.dybcio@somainline.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/ipa_resource.c b/drivers/net/ipa/ipa_resource.c index e3da95d69409..06cec7199382 100644 --- a/drivers/net/ipa/ipa_resource.c +++ b/drivers/net/ipa/ipa_resource.c @@ -52,7 +52,7 @@ static bool ipa_resource_limits_valid(struct ipa *ipa, return false; } - group_count = data->rsrc_group_src_count; + group_count = data->rsrc_group_dst_count; if (!group_count || group_count > IPA_RESOURCE_GROUP_MAX) return false; From aae458725412332825f31121a5feb8fd887cac5a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Nov 2021 13:08:24 -0800 Subject: [PATCH 028/336] ethernet: sis900: fix indentation A space has snuck in. 
Reported-by: kernel test robot Fixes: 74fad215ee3d ("ethernet: sis900: use eth_hw_addr_set()") Link: https://lore.kernel.org/r/20211111210824.676201-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sis/sis900.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index cc2d907c4c4b..23a336c5096e 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -392,7 +392,7 @@ static int sis96x_get_mac_addr(struct pci_dev *pci_dev, /* get MAC address from EEPROM */ for (i = 0; i < 3; i++) addr[i] = read_eeprom(ioaddr, i + EEPROMMACAddr); - eth_hw_addr_set(net_dev, (u8 *)addr); + eth_hw_addr_set(net_dev, (u8 *)addr); rc = 1; break; From 70701b83e208767f2720d8cd3e6a62cddafb3a30 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 11 Nov 2021 15:52:15 -0800 Subject: [PATCH 029/336] tcp: Fix uninitialized access in skb frags array for Rx 0cp. TCP Receive zerocopy iterates through the SKB queue via tcp_recv_skb(), acquiring a pointer to an SKB and an offset within that SKB to read from. From there, it iterates the SKB frags array to determine which offset to start remapping pages from. However, this is built on the assumption that the offset read so far within the SKB is smaller than the SKB length. If this assumption is violated, we can attempt to read an invalid frags array element, which would cause a fault. tcp_recv_skb() can cause such an SKB to be returned when the TCP FIN flag is set. Therefore, we must guard against this occurrence inside skb_advance_frag(). One way that we can reproduce this error follows: 1) In a receiver program, call getsockopt(TCP_ZEROCOPY_RECEIVE) with: char some_array[32 * 1024]; struct tcp_zerocopy_receive zc = { .copybuf_address = (__u64) &some_array[0], .copybuf_len = 32 * 1024, }; 2) In a sender program, after a TCP handshake, send the following sequence of packets: i) Seq = [X, X+4000] ii) Seq = [X+4000, X+5000] iii) Seq = [X+4000, X+5000], Flags = FIN | URG, urgptr=1000 (This can happen without URG, if we have a signal pending, but URG is a convenient way to reproduce the behaviour). In this case, the following event sequence will occur on the receiver: tcp_zerocopy_receive(): -> receive_fallback_to_copy() // copybuf_len >= inq -> tcp_recvmsg_locked() // reads 5000 bytes, then breaks due to URG -> tcp_recv_skb() // yields skb with skb->len == offset -> tcp_zerocopy_set_hint_for_skb() -> skb_advance_to_frag() // will returns a frags ptr. >= nr_frags -> find_next_mappable_frag() // will dereference this bad frags ptr. With this patch, skb_advance_to_frag() will no longer return an invalid frags pointer, and will return NULL instead, fixing the issue. 
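The guard can be illustrated outside the kernel. The sketch below is not the net/ipv4/tcp.c code (the struct fields and sizes are invented), but it shows why an offset equal to skb->len, which the FIN sequence above produces, has to be rejected before the frag array is walked; in the pre-fix function the walk instead handed back a frag pointer at or past nr_frags, as described above:

    #include <stdint.h>
    #include <stdio.h>

    /* Invented miniature of an skb with a linear area plus page frags. */
    struct toy_frag {
            uint32_t size;
    };

    struct toy_skb {
            uint32_t len;                   /* total payload bytes */
            uint32_t headlen;               /* bytes in the linear area */
            unsigned int nr_frags;
            struct toy_frag frags[4];
    };

    static struct toy_frag *toy_advance_to_frag(struct toy_skb *skb, uint32_t off)
    {
            unsigned int i;

            if (off >= skb->len)            /* the added guard */
                    return NULL;
            if (off < skb->headlen)         /* data still in the linear area */
                    return NULL;
            off -= skb->headlen;
            for (i = 0; i < skb->nr_frags; i++) {
                    if (off < skb->frags[i].size)
                            return &skb->frags[i];
                    off -= skb->frags[i].size;
            }
            return NULL;
    }

    int main(void)
    {
            /* A fully consumed skb: offset == len, nothing left to map. */
            struct toy_skb skb = { .len = 1000, .headlen = 1000, .nr_frags = 0 };

            printf("frag=%p\n", (void *)toy_advance_to_frag(&skb, 1000));
            return 0;
    }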
Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive") Link: https://lore.kernel.org/r/20211111235215.2605384-1-arjunroy.kdev@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b7796b4cf0a0..bbb3d39c69af 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1758,6 +1758,9 @@ static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, { skb_frag_t *frag; + if (unlikely(offset_skb >= skb->len)) + return NULL; + offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; From 1aa3b2207e889a948049c9a8016cedb0218c2389 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 12 Nov 2021 18:18:10 -0500 Subject: [PATCH 030/336] net,lsm,selinux: revert the security_sctp_assoc_established() hook This patch reverts two prior patches, e7310c94024c ("security: implement sctp_assoc_established hook in selinux") and 7c2ef0240e6a ("security: add sctp_assoc_established hook"), which create the security_sctp_assoc_established() LSM hook and provide a SELinux implementation. Unfortunately these two patches were merged without proper review (the Reviewed-by and Tested-by tags from Richard Haines were for previous revisions of these patches that were significantly different) and there are outstanding objections from the SELinux maintainers regarding these patches. Work is currently ongoing to correct the problems identified in the reverted patches, as well as others that have come up during review, but it is unclear at this point in time when that work will be ready for inclusion in the mainline kernel. In the interest of not keeping objectionable code in the kernel for multiple weeks, and potentially a kernel release, we are reverting the two problematic patches. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- Documentation/security/SCTP.rst | 22 ++++++++++++---------- include/linux/lsm_hook_defs.h | 2 -- include/linux/lsm_hooks.h | 5 ----- include/linux/security.h | 7 ------- net/sctp/sm_statefuns.c | 2 +- security/security.c | 7 ------- security/selinux/hooks.c | 14 +------------- 7 files changed, 14 insertions(+), 45 deletions(-) diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 406cc68b8808..d5fd6ccc3dcb 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -15,7 +15,10 @@ For security module support, three SCTP specific hooks have been implemented:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_sctp_assoc_established() + +Also the following security hook has been utilised:: + + security_inet_conn_established() The usage of these hooks are described below with the SELinux implementation described in the `SCTP SELinux Support`_ chapter. @@ -119,12 +122,11 @@ calls **sctp_peeloff**\(3). @newsk - pointer to new sock structure. -security_sctp_assoc_established() +security_inet_conn_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Called when a COOKIE ACK is received, and the peer secid will be -saved into ``@asoc->peer_secid`` for client:: +Called when a COOKIE ACK is received:: - @asoc - pointer to sctp association structure. + @sk - pointer to sock structure. @skb - pointer to skbuff of the COOKIE ACK packet. 
@@ -132,7 +134,7 @@ Security Hooks used for Association Establishment ------------------------------------------------- The following diagram shows the use of ``security_sctp_bind_connect()``, -``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when +``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when establishing an association. :: @@ -170,7 +172,7 @@ establishing an association. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | - Call security_sctp_assoc_established() | + Call security_inet_conn_established() | to set the peer label. | | | | If SCTP_SOCKET_TCP or peeled off @@ -196,7 +198,7 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_sctp_assoc_established() + security_inet_conn_established() security_sctp_assoc_request() @@ -269,12 +271,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and @newsk - pointer to new sock structure. -security_sctp_assoc_established() +security_inet_conn_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Called when a COOKIE ACK is received where it sets the connection's peer sid to that in ``@skb``:: - @asoc - pointer to sctp association structure. + @sk - pointer to sock structure. @skb - pointer to skbuff of the COOKIE ACK packet. diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 442a611fa0fb..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, - struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d6823214d5c1..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,11 +1050,6 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. 
* * Security hooks for Infiniband * diff --git a/include/linux/security.h b/include/linux/security.h index 06eac4e61a13..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } - -static inline void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ -} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 39ba82ee87ce..354c1c4de19b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); /* Set peer label for connection. */ - security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb); + security_inet_conn_established(ep->base.sk, chunk->skb); /* RFC 2960 5.1 Normal Establishment of an Association * diff --git a/security/security.c b/security/security.c index 779a9edea0a0..c88167a414b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2388,13 +2388,6 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ - call_void_hook(sctp_assoc_established, asoc, skb); -} -EXPORT_SYMBOL(security_sctp_assoc_established); - #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5e5215fe2e83..62d30c0a30c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5502,8 +5502,7 @@ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - if (asoc->secid != SECSID_WILD) - newsksec->sid = asoc->secid; + newsksec->sid = asoc->secid; newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); @@ -5559,16 +5558,6 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } -static void selinux_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ - struct sk_security_struct *sksec = asoc->base.sk->sk_security; - - selinux_inet_conn_established(asoc->base.sk, skb); - asoc->peer_secid = sksec->peer_sid; - asoc->secid = SECSID_WILD; -} - static int selinux_secmark_relabel_packet(u32 sid) { const struct task_security_struct *__tsec; @@ -7239,7 +7228,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), - LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), 
LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 14 Nov 2021 13:28:34 -0500 Subject: [PATCH 031/336] tracing: Add length protection to histogram string copies The string copies to the histogram storage has a max size of 256 bytes (defined by MAX_FILTER_STR_VAL). Only the string size of the event field needs to be copied to the event storage, but no more than what is in the event storage. Although nothing should be bigger than 256 bytes, there's no protection against overwriting of the storage if one day there is. Copy no more than the destination size, and enforce it. Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the min() comparison of the string sizes of comparable types. Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reported-by: Linus Torvalds Reviewed-by: Masami Hiramatsu Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events_hist.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 50453b287615..2d167ac3452c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,7 +673,7 @@ struct trace_event_file { #define PERF_MAX_TRACE_SIZE 8192 -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ +#define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1475d7347fe0..34afcaebd0e5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3026,8 +3026,10 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, if (val->flags & HIST_FIELD_FL_STRING) { char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; + unsigned int size; - strscpy(str, val_str, val->size); + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -4914,6 +4916,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_STRING) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; + unsigned int size; str_start = hist_data->n_field_var_str + hist_data->n_save_var_str; @@ -4922,7 +4925,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - strscpy(str, val_str, hist_field->size); + + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); hist_val = (u64)(uintptr_t)str; } From 158ea2d2b2ff8fb49f39209a31b4920f13193a3d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Sun, 14 Nov 2021 20:48:44 -0600 Subject: [PATCH 032/336] kbuild: Fix -Wimplicit-fallthrough=5 error for GCC 5.x and 6.x -Wimplicit-fallthrough=5 was under cc-option because it was only available in GCC 7.x and newer so the build is now broken for GCC 5.x and 6.x: gcc: error: unrecognized command line option '-Wimplicit-fallthrough=5'; did you mean '-Wno-fallthrough'? Fix this by moving -Wimplicit-fallthrough=5 under cc-option. Fixes: dee2b702bcf0 ("kconfig: Add support for -Wimplicit-fallthrough") Reported-by: Nathan Chancellor Co-developed-by: Nathan Chancellor Signed-off-by: Nathan Chancellor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..4b7bac10c72d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -887,7 +887,7 @@ config CC_HAS_INT128 config CC_IMPLICIT_FALLTHROUGH string - default "-Wimplicit-fallthrough=5" if CC_IS_GCC + default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough) # From dae581864609d36fb58855fd59880b4941ce9d14 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 24 Sep 2021 01:10:31 +1000 Subject: [PATCH 033/336] KVM: PPC: Book3S HV: Use GLOBAL_TOC for kvmppc_h_set_dabr/xdabr() kvmppc_h_set_dabr(), and kvmppc_h_set_xdabr() which jumps into it, need to use _GLOBAL_TOC to setup the kernel TOC pointer, because kvmppc_h_set_dabr() uses LOAD_REG_ADDR() to load dawr_force_enable. When called from hcall_try_real_mode() we have the kernel TOC in r2, established near the start of kvmppc_interrupt_hv(), so there is no issue. But they can also be called from kvmppc_pseries_do_hcall() which is module code, so the access ends up happening with the kvm-hv module's r2, which will not point at dawr_force_enable and could even cause a fault. With the current code layout and compilers we haven't observed a fault in practice, the load hits somewhere in kvm-hv.ko and silently returns some bogus value. Note that we we expect p8/p9 guests to use the DAWR, but SLOF uses h_set_dabr() to test if sc1 works correctly, see SLOF's lib/libhvcall/brokensc1.c. Fixes: c1fe190c0672 ("powerpc: Add force enable of DAWR on P9 option") Signed-off-by: Michael Ellerman Reviewed-by: Daniel Axtens Link: https://lore.kernel.org/r/20210923151031.72408-1-mpe@ellerman.id.au --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index eb776d0c5d8e..32a4b4d412b9 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2005,7 +2005,7 @@ hcall_real_table: .globl hcall_real_table_end hcall_real_table_end: -_GLOBAL(kvmppc_h_set_xdabr) +_GLOBAL_TOC(kvmppc_h_set_xdabr) EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) andi. 
r0, r5, DABRX_USER | DABRX_KERNEL beq 6f @@ -2015,7 +2015,7 @@ EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) 6: li r3, H_PARAMETER blr -_GLOBAL(kvmppc_h_set_dabr) +_GLOBAL_TOC(kvmppc_h_set_dabr) EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr) li r5, DABRX_USER | DABRX_KERNEL 3: From 2da516d7ed0865bf9835830907f2f6631006d628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 10 Nov 2021 12:07:39 +0100 Subject: [PATCH 034/336] powerpc/83xx/mpc8349emitx: Drop unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d354dc35ebb ("powerpc/83xx/mpc8349emitx: Make mcu_gpiochip_remove() return void") removed the usage of the variable ret, but failed to remove the variable itself, resulting in: arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c: In function ‘mcu_remove’: arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c:189:6: error: unused variable ‘ret’ [-Werror=unused-variable] 189 | int ret; | ^~~ So remove the variable now. Reported-by: Linux Kernel Functional Testing Signed-off-by: Uwe Kleine-König Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211110110739.1072634-1-u.kleine-koenig@pengutronix.de --- arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index bb789f33c70e..a38372f9ac12 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -186,7 +186,6 @@ err: static int mcu_remove(struct i2c_client *client) { struct mcu *mcu = i2c_get_clientdata(client); - int ret; kthread_stop(shutdown_thread); From 964c33cd0be621b291b5d253d8731eb2680082cb Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 10 Nov 2021 03:50:15 +0900 Subject: [PATCH 035/336] powerpc: clean vdso32 and vdso64 directories Since commit bce74491c300 ("powerpc/vdso: fix unnecessary rebuilds of vgettimeofday.o"), "make ARCH=powerpc clean" does not clean up the arch/powerpc/kernel/{vdso32,vdso64} directories. Use the subdir- trick to let "make clean" descend into them. Fixes: bce74491c300 ("powerpc/vdso: fix unnecessary rebuilds of vgettimeofday.o") Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211109185015.615517-1-masahiroy@kernel.org --- arch/powerpc/kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 0e3640e14eb1..5fa68c2ef1f8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -196,3 +196,6 @@ clean-files := vmlinux.lds # Force dependency (incbin is bad) $(obj)/vdso32_wrapper.o : $(obj)/vdso32/vdso32.so.dbg $(obj)/vdso64_wrapper.o : $(obj)/vdso64/vdso64.so.dbg + +# for cleaning +subdir- += vdso32 vdso64 From 0bd81274e3f1195ee7c820ef02d62f31077c42c3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Nov 2021 16:48:59 +1000 Subject: [PATCH 036/336] powerpc/pseries: rename numa_dist_table to form2_distances The name of the local variable holding the "form2" property address conflicts with the numa_distance_table global. This patch does 's/numa_dist_table/form2_distances/g' over the function, which also renames numa_dist_table_length to form2_distances_length. 
Suggested-by: Michael Ellerman Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211109064900.2041386-1-npiggin@gmail.com --- arch/powerpc/mm/numa.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 6f14c8fb6359..53e990140916 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -376,9 +376,9 @@ static void initialize_form2_numa_distance_lookup_table(void) { int i, j; struct device_node *root; - const __u8 *numa_dist_table; + const __u8 *form2_distances; const __be32 *numa_lookup_index; - int numa_dist_table_length; + int form2_distances_length; int max_numa_index, distance_index; if (firmware_has_feature(FW_FEATURE_OPAL)) @@ -392,20 +392,20 @@ static void initialize_form2_numa_distance_lookup_table(void) max_numa_index = of_read_number(&numa_lookup_index[0], 1); /* first element of the array is the size and is encode-int */ - numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); - numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL); + form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1); /* Skip the size which is encoded int */ - numa_dist_table += sizeof(__be32); + form2_distances += sizeof(__be32); - pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", - numa_dist_table_length, max_numa_index); + pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n", + form2_distances_length, max_numa_index); for (i = 0; i < max_numa_index; i++) /* +1 skip the max_numa_index in the property */ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); - if (numa_dist_table_length != max_numa_index * max_numa_index) { + if (form2_distances_length != max_numa_index * max_numa_index) { WARN(1, "Wrong NUMA distance information\n"); /* consider everybody else just remote. */ for (i = 0; i < max_numa_index; i++) { @@ -427,7 +427,7 @@ static void initialize_form2_numa_distance_lookup_table(void) int nodeA = numa_id_index_table[i]; int nodeB = numa_id_index_table[j]; - numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + numa_distance_table[nodeA][nodeB] = form2_distances[distance_index++]; pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); } } From 302039466f6a3b9421ecb9a6a2c528801dc24a86 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Nov 2021 16:49:00 +1000 Subject: [PATCH 037/336] powerpc/pseries: Fix numa FORM2 parsing fallback code In case the FORM2 distance table from firmware is not the expected size, there is fallback code that just populates the lookup table as local vs remote. However it then continues on to use the distance table. Fix. 
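The shape of the corrected fallback fits in a few lines of standalone C (the distance values and the helper below are illustrative, not the numa.c code): when the firmware table is unusable it is treated as absent, and every node pair falls back to local/remote instead of being read from a table of the wrong size:

    #include <stdio.h>

    #define LOCAL_DISTANCE  10              /* illustrative values */
    #define REMOTE_DISTANCE 20

    static int pair_distance(const unsigned char *form2_distances, int *idx,
                             int node_a, int node_b)
    {
            if (form2_distances)            /* well-formed firmware table */
                    return form2_distances[(*idx)++];
            return node_a == node_b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
    }

    int main(void)
    {
            const unsigned char *form2_distances = NULL; /* wrong size -> ignored */
            int idx = 0;

            printf("%d %d\n",
                   pair_distance(form2_distances, &idx, 0, 0),     /* 10 */
                   pair_distance(form2_distances, &idx, 0, 1));    /* 20 */
            return 0;
    }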
Fixes: 1c6b5a7e7405 ("powerpc/pseries: Add support for FORM2 associativity") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211109064900.2041386-2-npiggin@gmail.com --- arch/powerpc/mm/numa.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 53e990140916..59d3cfcd7887 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -407,30 +407,26 @@ static void initialize_form2_numa_distance_lookup_table(void) if (form2_distances_length != max_numa_index * max_numa_index) { WARN(1, "Wrong NUMA distance information\n"); - /* consider everybody else just remote. */ - for (i = 0; i < max_numa_index; i++) { - for (j = 0; j < max_numa_index; j++) { - int nodeA = numa_id_index_table[i]; - int nodeB = numa_id_index_table[j]; - - if (nodeA == nodeB) - numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; - else - numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; - } - } + form2_distances = NULL; // don't use it } - distance_index = 0; for (i = 0; i < max_numa_index; i++) { for (j = 0; j < max_numa_index; j++) { int nodeA = numa_id_index_table[i]; int nodeB = numa_id_index_table[j]; + int dist; - numa_distance_table[nodeA][nodeB] = form2_distances[distance_index++]; - pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + if (form2_distances) + dist = form2_distances[distance_index++]; + else if (nodeA == nodeB) + dist = LOCAL_DISTANCE; + else + dist = REMOTE_DISTANCE; + numa_distance_table[nodeA][nodeB] = dist; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist); } } + of_node_put(root); } From 2d33f5504490a9d90924476dbccd4a5349ee1ad0 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 8 Nov 2021 15:03:17 +1100 Subject: [PATCH 038/336] powerpc/pseries/ddw: Revert "Extend upper limit for huge DMA window for persistent memory" This reverts commit 54fc3c681ded9437e4548e2501dc1136b23cfa9a which does not allow 1:1 mapping even for the system RAM which is usually possible. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211108040320.3857636-2-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 49b401536d29..64385d6f33c2 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1094,15 +1094,6 @@ static phys_addr_t ddw_memory_hotplug_max(void) phys_addr_t max_addr = memory_hotplug_max(); struct device_node *memory; - /* - * The "ibm,pmemory" can appear anywhere in the address space. - * Assuming it is still backed by page structs, set the upper limit - * for the huge DMA window as MAX_PHYSMEM_BITS. - */ - if (of_find_node_by_type(NULL, "ibm,pmemory")) - return (sizeof(phys_addr_t) * 8 <= MAX_PHYSMEM_BITS) ? - (phys_addr_t) -1 : (1ULL << MAX_PHYSMEM_BITS); - for_each_node_by_type(memory, "memory") { unsigned long start, size; int n_mem_addr_cells, n_mem_size_cells, len; From fb4ee2b30cd09e95524640149e4ee0d7f22c3e7b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 8 Nov 2021 15:03:18 +1100 Subject: [PATCH 039/336] powerpc/pseries/ddw: simplify enable_ddw() This drops rather useless ddw_enabled flag as direct_mapping implies it anyway. While at this, fix indents in enable_ddw(). This should not cause any behavioral change. 
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211108040320.3857636-3-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 64385d6f33c2..301fa5b3d528 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1229,7 +1229,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct dma_win *window; struct property *win64; - bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; bool default_win_removed = false, direct_mapping = false; bool pmem_present; @@ -1244,7 +1243,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { direct_mapping = (len >= max_ram_len); - ddw_enabled = true; goto out_unlock; } @@ -1397,8 +1395,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", dn, ret); - /* Make sure to clean DDW if any TCE was set*/ - clean_dma_window(pdn, win64->value); + /* Make sure to clean DDW if any TCE was set*/ + clean_dma_window(pdn, win64->value); goto out_del_list; } } else { @@ -1445,7 +1443,6 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) spin_unlock(&dma_win_list_lock); dev->dev.archdata.dma_offset = win_addr; - ddw_enabled = true; goto out_unlock; out_del_list: @@ -1481,10 +1478,10 @@ out_unlock: * as RAM, then we failed to create a window to cover persistent * memory and need to set the DMA limit. */ - if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len) + if (pmem_present && direct_mapping && len == max_ram_len) dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return ddw_enabled && direct_mapping; + return direct_mapping; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) From ad3976025b311cdeb822ad3e7a7554018cb0f83f Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 8 Nov 2021 15:03:19 +1100 Subject: [PATCH 040/336] powerpc/pseries/ddw: Do not try direct mapping with persistent memory and one window There is a possibility of having just one DMA window available with a limited capacity which the existing code does not handle that well. If the window is big enough for the system RAM but less than MAX_PHYSMEM_BITS (which we want when persistent memory is present), we create 1:1 window and leave persistent memory without DMA. This disables 1:1 mapping entirely if there is persistent memory and either: - the huge DMA window does not cover the entire address space; - the default DMA window is removed. This relies on reverted 54fc3c681ded ("powerpc/pseries/ddw: Extend upper limit for huge DMA window for persistent memory") to return the actual amount RAM in ddw_memory_hotplug_max() (posted separately). 
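Put as a standalone check (the helper below is hypothetical and only the three conditions come from the patch; the MAX_PHYSMEM_BITS value is illustrative), 1:1 mapping is now attempted only when the default window is still present, or the new window spans the whole address space, or there is no persistent memory and the window at least covers RAM:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_PHYSMEM_BITS 51     /* illustrative value */

    /* Hypothetical helper mirroring the condition added in enable_ddw(). */
    static bool want_direct_mapping(bool default_win_removed, bool pmem_present,
                                    int len, int max_ram_len)
    {
            return !default_win_removed ||
                   (len == MAX_PHYSMEM_BITS) ||
                   (!pmem_present && (len == max_ram_len));
    }

    int main(void)
    {
            /* pmem present, default window removed, window only covers RAM:
             * no 1:1 mapping, since pmem would be left without DMA.
             */
            printf("%d\n", want_direct_mapping(true, true, 40, 40));  /* 0 */

            /* No pmem and the window covers RAM: 1:1 mapping is fine. */
            printf("%d\n", want_direct_mapping(true, false, 40, 40)); /* 1 */
            return 0;
    }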
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211108040320.3857636-4-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 301fa5b3d528..8f998e55735b 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -1356,8 +1356,10 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) len = order_base_2(query.largest_available_block << page_shift); win_name = DMA64_PROPNAME; } else { - direct_mapping = true; - win_name = DIRECT64_PROPNAME; + direct_mapping = !default_win_removed || + (len == MAX_PHYSMEM_BITS) || + (!pmem_present && (len == max_ram_len)); + win_name = direct_mapping ? DIRECT64_PROPNAME : DMA64_PROPNAME; } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); From 995f54ea962e03ec08b8bc6a4fe11a32b420edd3 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 8 Jul 2021 19:51:46 +0200 Subject: [PATCH 041/336] drm/cma-helper: Release non-coherent memory with dma_free_noncoherent() The GEM CMA helpers allocate non-coherent (i.e., cached) backing storage with dma_alloc_noncoherent(), but release it with dma_free_wc(). Fix this with a call to dma_free_noncoherent(). Writecombining storage is still released with dma_free_wc(). Signed-off-by: Thomas Zimmermann Fixes: cf8ccbc72d61 ("drm: Add support for GEM buffers backed by non-coherent memory") Acked-by: Paul Cercueil Cc: Thomas Zimmermann Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.14+ Link: https://patchwork.freedesktop.org/patch/msgid/20210708175146.10618-1-tzimmermann@suse.de --- drivers/gpu/drm/drm_gem_cma_helper.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c index d53388199f34..9d05674550a4 100644 --- a/drivers/gpu/drm/drm_gem_cma_helper.c +++ b/drivers/gpu/drm/drm_gem_cma_helper.c @@ -210,8 +210,13 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj) dma_buf_vunmap(gem_obj->import_attach->dmabuf, &map); drm_prime_gem_destroy(gem_obj, cma_obj->sgt); } else if (cma_obj->vaddr) { - dma_free_wc(gem_obj->dev->dev, cma_obj->base.size, - cma_obj->vaddr, cma_obj->paddr); + if (cma_obj->map_noncoherent) + dma_free_noncoherent(gem_obj->dev->dev, cma_obj->base.size, + cma_obj->vaddr, cma_obj->paddr, + DMA_TO_DEVICE); + else + dma_free_wc(gem_obj->dev->dev, cma_obj->base.size, + cma_obj->vaddr, cma_obj->paddr); } drm_gem_object_release(gem_obj); From 6d6a8d6a4ed03702fe73cc7770acddda5ecc8a15 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Wed, 10 Nov 2021 18:16:48 +0900 Subject: [PATCH 042/336] docs: Update Sphinx requirements Commit f546ff0c0c07 ("Move our minimum Sphinx version to 1.7") raised the minimum version to 1.7. For pdfdocs, sphinx_pre_install says: note: If you want pdf, you need at least Sphinx 2.4.4. , and current requirements.txt installs Sphinx 2.4.4. Update Sphinx versions mentioned in docs and remove a note on earlier Sphinx versions. Update zh_CN and it_IT translations as well. 
Signed-off-by: Akira Yokosawa Cc: Federico Vaga Cc: Alex Shi Reviewed-by: Alex Shi Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/sphinx.rst | 22 ++++++++---------- Documentation/process/changes.rst | 2 +- .../translations/it_IT/doc-guide/sphinx.rst | 23 ++++++++----------- .../translations/it_IT/process/changes.rst | 2 +- .../translations/zh_CN/doc-guide/sphinx.rst | 21 ++++++++--------- 5 files changed, 29 insertions(+), 41 deletions(-) diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst index ec3e71f56009..e445cb146efe 100644 --- a/Documentation/doc-guide/sphinx.rst +++ b/Documentation/doc-guide/sphinx.rst @@ -27,7 +27,7 @@ Sphinx Install ============== The ReST markups currently used by the Documentation/ files are meant to be -built with ``Sphinx`` version 1.3 or higher. +built with ``Sphinx`` version 1.7 or higher. There's a script that checks for the Sphinx requirements. Please see :ref:`sphinx-pre-install` for further details. @@ -43,10 +43,6 @@ or ``virtualenv``, depending on how your distribution packaged Python 3. .. note:: - #) Sphinx versions below 1.5 don't work properly with Python's - docutils version 0.13.1 or higher. So, if you're willing to use - those versions, you should run ``pip install 'docutils==0.12'``. - #) It is recommended to use the RTD theme for html output. Depending on the Sphinx version, it should be installed separately, with ``pip install sphinx_rtd_theme``. @@ -55,13 +51,13 @@ or ``virtualenv``, depending on how your distribution packaged Python 3. those expressions are written using LaTeX notation. It needs texlive installed with amsfonts and amsmath in order to evaluate them. -In summary, if you want to install Sphinx version 1.7.9, you should do:: +In summary, if you want to install Sphinx version 2.4.4, you should do:: - $ virtualenv sphinx_1.7.9 - $ . sphinx_1.7.9/bin/activate - (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_2.4.4 + $ . sphinx_2.4.4/bin/activate + (sphinx_2.4.4) $ pip install -r Documentation/sphinx/requirements.txt -After running ``. sphinx_1.7.9/bin/activate``, the prompt will change, +After running ``. sphinx_2.4.4/bin/activate``, the prompt will change, in order to indicate that you're using the new environment. If you open a new shell, you need to rerun this command to enter again at the virtual environment before building the documentation. @@ -81,7 +77,7 @@ output. PDF and LaTeX builds -------------------- -Such builds are currently supported only with Sphinx versions 1.4 and higher. +Such builds are currently supported only with Sphinx versions 2.4 and higher. For PDF and LaTeX output, you'll also need ``XeLaTeX`` version 3.14159265. @@ -104,8 +100,8 @@ command line options for your distro:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.7.9 - . sphinx_1.7.9/bin/activate + /usr/bin/virtualenv sphinx_2.4.4 + . sphinx_2.4.4/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. 
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index e35ab74a0f80..b398b8576417 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -54,7 +54,7 @@ mcelog 0.6 mcelog --version iptables 1.4.2 iptables -V openssl & libcrypto 1.0.0 openssl version bc 1.06.95 bc --version -Sphinx\ [#f1]_ 1.3 sphinx-build --version +Sphinx\ [#f1]_ 1.7 sphinx-build --version ====================== =============== ======================================== .. [#f1] Sphinx is needed only to build the Kernel documentation diff --git a/Documentation/translations/it_IT/doc-guide/sphinx.rst b/Documentation/translations/it_IT/doc-guide/sphinx.rst index 0046d75d9a70..9762452c584c 100644 --- a/Documentation/translations/it_IT/doc-guide/sphinx.rst +++ b/Documentation/translations/it_IT/doc-guide/sphinx.rst @@ -35,7 +35,7 @@ Installazione Sphinx ==================== I marcatori ReST utilizzati nei file in Documentation/ sono pensati per essere -processati da ``Sphinx`` nella versione 1.3 o superiore. +processati da ``Sphinx`` nella versione 1.7 o superiore. Esiste uno script che verifica i requisiti Sphinx. Per ulteriori dettagli consultate :ref:`it_sphinx-pre-install`. @@ -53,11 +53,6 @@ pacchettizzato dalla vostra distribuzione. .. note:: - #) Le versioni di Sphinx inferiori alla 1.5 non funzionano bene - con il pacchetto Python docutils versione 0.13.1 o superiore. - Se volete usare queste versioni, allora dovere eseguire - ``pip install 'docutils==0.12'``. - #) Viene raccomandato l'uso del tema RTD per la documentazione in HTML. A seconda della versione di Sphinx, potrebbe essere necessaria l'installazione tramite il comando ``pip install sphinx_rtd_theme``. @@ -67,13 +62,13 @@ pacchettizzato dalla vostra distribuzione. utilizzando LaTeX. Per una corretta interpretazione, è necessario aver installato texlive con i pacchetti amdfonts e amsmath. -Riassumendo, se volete installare la versione 1.7.9 di Sphinx dovete eseguire:: +Riassumendo, se volete installare la versione 2.4.4 di Sphinx dovete eseguire:: - $ virtualenv sphinx_1.7.9 - $ . sphinx_1.7.9/bin/activate - (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_2.4.4 + $ . sphinx_2.4.4/bin/activate + (sphinx_2.4.4) $ pip install -r Documentation/sphinx/requirements.txt -Dopo aver eseguito ``. sphinx_1.7.9/bin/activate``, il prompt cambierà per +Dopo aver eseguito ``. sphinx_2.4.4/bin/activate``, il prompt cambierà per indicare che state usando il nuovo ambiente. Se aprite un nuova sessione, prima di generare la documentazione, dovrete rieseguire questo comando per rientrare nell'ambiente virtuale. @@ -94,7 +89,7 @@ Generazione in PDF e LaTeX -------------------------- Al momento, la generazione di questi documenti è supportata solo dalle -versioni di Sphinx superiori alla 1.4. +versioni di Sphinx superiori alla 2.4. Per la generazione di PDF e LaTeX, avrete bisogno anche del pacchetto ``XeLaTeX`` nella versione 3.14159265 @@ -119,8 +114,8 @@ l'installazione:: You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.7.9 - . sphinx_1.7.9/bin/activate + /usr/bin/virtualenv sphinx_2.4.4 + . sphinx_2.4.4/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. 
diff --git a/Documentation/translations/it_IT/process/changes.rst b/Documentation/translations/it_IT/process/changes.rst index 87d081889bfc..dc7193377b7f 100644 --- a/Documentation/translations/it_IT/process/changes.rst +++ b/Documentation/translations/it_IT/process/changes.rst @@ -57,7 +57,7 @@ mcelog 0.6 mcelog --version iptables 1.4.2 iptables -V openssl & libcrypto 1.0.0 openssl version bc 1.06.95 bc --version -Sphinx\ [#f1]_ 1.3 sphinx-build --version +Sphinx\ [#f1]_ 1.7 sphinx-build --version ====================== ================= ======================================== .. [#f1] Sphinx è necessario solo per produrre la documentazione del Kernel diff --git a/Documentation/translations/zh_CN/doc-guide/sphinx.rst b/Documentation/translations/zh_CN/doc-guide/sphinx.rst index 951595c7d599..23eac67fbc30 100644 --- a/Documentation/translations/zh_CN/doc-guide/sphinx.rst +++ b/Documentation/translations/zh_CN/doc-guide/sphinx.rst @@ -26,7 +26,7 @@ reStructuredText文件可能包含包含来自源文件的结构化文档注释 安装Sphinx ========== -Documentation/ 下的ReST文件现在使用sphinx1.3或更高版本构建。 +Documentation/ 下的ReST文件现在使用sphinx1.7或更高版本构建。 这有一个脚本可以检查Sphinx的依赖项。更多详细信息见 :ref:`sphinx-pre-install_zh` 。 @@ -40,22 +40,19 @@ Documentation/ 下的ReST文件现在使用sphinx1.3或更高版本构建。 .. note:: - #) 低于1.5版本的Sphinx无法与Python的0.13.1或更高版本docutils一起正常工作。 - 如果您想使用这些版本,那么应该运行 ``pip install 'docutils==0.12'`` 。 - #) html输出建议使用RTD主题。根据Sphinx版本的不同,它应该用 ``pip install sphinx_rtd_theme`` 单独安装。 #) 一些ReST页面包含数学表达式。由于Sphinx的工作方式,这些表达式是使用 LaTeX 编写的。它需要安装amsfonts和amsmath宏包,以便显示。 -总之,如您要安装Sphinx 1.7.9版本,应执行:: +总之,如您要安装Sphinx 2.4.4版本,应执行:: - $ virtualenv sphinx_1.7.9 - $ . sphinx_1.7.9/bin/activate - (sphinx_1.7.9) $ pip install -r Documentation/sphinx/requirements.txt + $ virtualenv sphinx_2.4.4 + $ . sphinx_2.4.4/bin/activate + (sphinx_2.4.4) $ pip install -r Documentation/sphinx/requirements.txt -在运行 ``. sphinx_1.7.9/bin/activate`` 之后,提示符将变化,以指示您正在使用新 +在运行 ``. sphinx_2.4.4/bin/activate`` 之后,提示符将变化,以指示您正在使用新 环境。如果您打开了一个新的shell,那么在构建文档之前,您需要重新运行此命令以再 次进入虚拟环境中。 @@ -71,7 +68,7 @@ Documentation/ 下的ReST文件现在使用sphinx1.3或更高版本构建。 PDF和LaTeX构建 -------------- -目前只有Sphinx 1.4及更高版本才支持这种构建。 +目前只有Sphinx 2.4及更高版本才支持这种构建。 对于PDF和LaTeX输出,还需要 ``XeLaTeX`` 3.14159265版本。(译注:此版本号真实 存在) @@ -93,8 +90,8 @@ PDF和LaTeX构建 You should run: sudo dnf install -y texlive-luatex85 - /usr/bin/virtualenv sphinx_1.7.9 - . sphinx_1.7.9/bin/activate + /usr/bin/virtualenv sphinx_2.4.4 + . sphinx_2.4.4/bin/activate pip install -r Documentation/sphinx/requirements.txt Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468. From de80e6c51e5044cd8581a5e2d79509d75da55de9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 8 Oct 2021 18:01:03 +0200 Subject: [PATCH 043/336] Documentation: arm: marvell: Add some links to homepage / product infos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Webarchive contains some useful resources like product info or links to other documents. Signed-off-by: Pali Rohár Signed-off-by: Jonathan Corbet --- Documentation/arm/marvell.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/arm/marvell.rst b/Documentation/arm/marvell.rst index 8323c79d321b..d4b5500a0b23 100644 --- a/Documentation/arm/marvell.rst +++ b/Documentation/arm/marvell.rst @@ -104,6 +104,8 @@ Discovery family Not supported by the Linux kernel. 
+ Homepage: + https://web.archive.org/web/20110924171043/http://www.marvell.com/embedded-processors/discovery-innovation/ Core: Feroceon 88fr571-vd ARMv5 compatible @@ -120,6 +122,7 @@ EBU Armada family - 88F6707 - 88F6W11 + - Product infos: https://web.archive.org/web/20141002083258/http://www.marvell.com/embedded-processors/armada-370/ - Product Brief: https://web.archive.org/web/20121115063038/http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf - Hardware Spec: https://web.archive.org/web/20140617183747/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf - Functional Spec: https://web.archive.org/web/20140617183701/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf @@ -130,6 +133,7 @@ EBU Armada family Armada 375 Flavors: - 88F6720 + - Product infos: https://web.archive.org/web/20140108032402/http://www.marvell.com/embedded-processors/armada-375/ - Product Brief: https://web.archive.org/web/20131216023516/http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf Core: @@ -170,6 +174,9 @@ EBU Armada family NOTE: not to be confused with the non-SMP 78xx0 SoCs + Product infos: + https://web.archive.org/web/20150101215721/http://www.marvell.com/embedded-processors/armada-xp/ + Product Brief: https://web.archive.org/web/20121021173528/http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf From b3dda08c3304f977f279bc779c1695ee4b6d1101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 8 Oct 2021 18:01:04 +0200 Subject: [PATCH 044/336] Documentation: arm: marvell: Put Armada XP section between Armada 370 and 375 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From evolution and feature point of view Armada XP belongs between Armada 370 and Armada 375 families. 
Signed-off-by: Pali Rohár Signed-off-by: Jonathan Corbet --- Documentation/arm/marvell.rst | 45 +++++++++++++++-------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/Documentation/arm/marvell.rst b/Documentation/arm/marvell.rst index d4b5500a0b23..0fb61e1e2c43 100644 --- a/Documentation/arm/marvell.rst +++ b/Documentation/arm/marvell.rst @@ -130,6 +130,25 @@ EBU Armada family Core: Sheeva ARMv7 compatible PJ4B + Armada XP Flavors: + - MV78230 + - MV78260 + - MV78460 + + NOTE: + not to be confused with the non-SMP 78xx0 SoCs + + - Product infos: https://web.archive.org/web/20150101215721/http://www.marvell.com/embedded-processors/armada-xp/ + - Product Brief: https://web.archive.org/web/20121021173528/http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf + - Functional Spec: https://web.archive.org/web/20180829171131/http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf + - Hardware Specs: + - https://web.archive.org/web/20141127013651/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF + - https://web.archive.org/web/20141222000224/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF + - https://web.archive.org/web/20141222000230/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF + + Core: + Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP + Armada 375 Flavors: - 88F6720 @@ -166,32 +185,6 @@ EBU Armada family Core: ARM Cortex-A9 - Armada XP Flavors: - - MV78230 - - MV78260 - - MV78460 - - NOTE: - not to be confused with the non-SMP 78xx0 SoCs - - Product infos: - https://web.archive.org/web/20150101215721/http://www.marvell.com/embedded-processors/armada-xp/ - - Product Brief: - https://web.archive.org/web/20121021173528/http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf - - Functional Spec: - https://web.archive.org/web/20180829171131/http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf - - - Hardware Specs: - - - https://web.archive.org/web/20141127013651/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF - - https://web.archive.org/web/20141222000224/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF - - https://web.archive.org/web/20141222000230/http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF - - Core: - Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP - Linux kernel mach directory: arch/arm/mach-mvebu Linux kernel plat directory: From 738943fab84852261136dbf6459e41ed661634a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Fri, 8 Oct 2021 18:01:05 +0200 Subject: [PATCH 045/336] Documentation: arm: marvell: Fix link to armada_1000_pb.pdf document MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit File armada_1000_pb.pdf is not available on Marvell website anymore. So update link to webarchive where is backup copy. 
Signed-off-by: Pali Rohár Signed-off-by: Jonathan Corbet --- Documentation/arm/marvell.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arm/marvell.rst b/Documentation/arm/marvell.rst index 0fb61e1e2c43..9485a5a2e2e9 100644 --- a/Documentation/arm/marvell.rst +++ b/Documentation/arm/marvell.rst @@ -436,7 +436,7 @@ Berlin family (Multimedia Solutions) - Flavors: - 88DE3010, Armada 1000 (no Linux support) - Core: Marvell PJ1 (ARMv5TE), Dual-core - - Product Brief: http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf + - Product Brief: https://web.archive.org/web/20131103162620/http://www.marvell.com/digital-entertainment/assets/armada_1000_pb.pdf - 88DE3005, Armada 1500 Mini - Design name: BG2CD - Core: ARM Cortex-A9, PL310 L2CC From 951e0d00205cd3bbf457d31515aa6a96c5f4d053 Mon Sep 17 00:00:00 2001 From: Zhaoyu Liu Date: Sat, 13 Nov 2021 21:37:34 +0800 Subject: [PATCH 046/336] docs: ftrace: fix the wrong path of tracefs Delete "tracing" due to it has been included in /proc/mounts. Delete "echo nop > $tracefs/tracing/current_tracer", maybe this command is redundant. Signed-off-by: Zhaoyu Liu Acked-by: Steven Rostedt (VMware) Signed-off-by: Jonathan Corbet --- Documentation/trace/ftrace.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 4e5b26f03d5b..b3166c4a7867 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -2442,11 +2442,10 @@ Or this simple script! #!/bin/bash tracefs=`sed -ne 's/^tracefs \(.*\) tracefs.*/\1/p' /proc/mounts` - echo nop > $tracefs/tracing/current_tracer - echo 0 > $tracefs/tracing/tracing_on - echo $$ > $tracefs/tracing/set_ftrace_pid - echo function > $tracefs/tracing/current_tracer - echo 1 > $tracefs/tracing/tracing_on + echo 0 > $tracefs/tracing_on + echo $$ > $tracefs/set_ftrace_pid + echo function > $tracefs/current_tracer + echo 1 > $tracefs/tracing_on exec "$@" From 563fbefed46ae4c1f70cffb8eb54c02df480b2c2 Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Thu, 28 Oct 2021 01:37:22 +0800 Subject: [PATCH 047/336] cfg80211: call cfg80211_stop_ap when switch from P2P_GO type If the userspace tools switch from NL80211_IFTYPE_P2P_GO to NL80211_IFTYPE_ADHOC via send_msg(NL80211_CMD_SET_INTERFACE), it does not call the cleanup cfg80211_stop_ap(), this leads to the initialization of in-use data. For example, this path re-init the sdata->assigned_chanctx_list while it is still an element of assigned_vifs list, and makes that linked list corrupt. 
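The list corruption here is the generic struct list_head failure mode: re-running INIT_LIST_HEAD() on an entry that is still linked leaves its neighbours pointing at a node that no longer points back at them. A minimal sketch of that effect, using the plain kernel list API rather than the real cfg80211/mac80211 structures; the names below are purely illustrative:

    #include <linux/list.h>

    struct vif_entry {
        struct list_head assigned_list;   /* stands in for assigned_chanctx_list */
    };

    static LIST_HEAD(assigned_vifs);      /* stands in for the chanctx's list head */

    static void show_corruption(struct vif_entry *e)
    {
        list_add(&e->assigned_list, &assigned_vifs);   /* entry is now linked */

        /*
         * Re-initializing the entry while it is still linked makes it
         * point to itself, while assigned_vifs still points at it. The
         * next list_del() or list_for_each() on that list then follows
         * stale pointers; that is the corruption described above.
         */
        INIT_LIST_HEAD(&e->assigned_list);
    }

Running cfg80211_stop_ap() for the P2P_GO case, as added below, runs the normal AP teardown and unlinks such state before the interface type change re-initializes it.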
Signed-off-by: Nguyen Dinh Phi Reported-by: syzbot+bbf402b783eeb6d908db@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20211027173722.777287-1-phind.uet@gmail.com Cc: stable@vger.kernel.org Fixes: ac800140c20e ("cfg80211: .stop_ap when interface is going down") Signed-off-by: Johannes Berg --- net/wireless/util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/util.c b/net/wireless/util.c index 5ff1f8726faf..41ea65deb6e1 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1046,6 +1046,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, switch (otype) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, true); break; case NL80211_IFTYPE_ADHOC: From ce6b69749961426c6d822215ded9e67154e1ad4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 Oct 2021 09:25:39 +0200 Subject: [PATCH 048/336] nl80211: fix radio statistics in survey dump Even if userspace specifies the NL80211_ATTR_SURVEY_RADIO_STATS attribute, we cannot get the statistics because we're not really parsing the incoming attributes properly any more. Fix this by passing the attrbuf to nl80211_prepare_wdev_dump() and filling it there, if given, and using a local version only if no output is desired. Since I'm touching it anyway, make nl80211_prepare_wdev_dump() static. Fixes: 50508d941c18 ("cfg80211: use parallel_ops for genl") Reported-by: Jan Fuchs Signed-off-by: Johannes Berg Tested-by: Sven Eckelmann Link: https://lore.kernel.org/r/20211029092539.2851b4799386.If9736d4575ee79420cbec1bd930181e1d53c7317@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 34 +++++++++++++++++++--------------- net/wireless/nl80211.h | 6 +----- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 81232b73df8f..a27b3b5fa210 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -936,33 +936,37 @@ nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, }; -int nl80211_prepare_wdev_dump(struct netlink_callback *cb, - struct cfg80211_registered_device **rdev, - struct wireless_dev **wdev) +static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct wireless_dev **wdev, + struct nlattr **attrbuf) { int err; if (!cb->args[0]) { - struct nlattr **attrbuf; + struct nlattr **attrbuf_free = NULL; - attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), - GFP_KERNEL); - if (!attrbuf) - return -ENOMEM; + if (!attrbuf) { + attrbuf = kcalloc(NUM_NL80211_ATTR, sizeof(*attrbuf), + GFP_KERNEL); + if (!attrbuf) + return -ENOMEM; + attrbuf_free = attrbuf; + } err = nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, nl80211_fam.maxattr, nl80211_policy, NULL); if (err) { - kfree(attrbuf); + kfree(attrbuf_free); return err; } rtnl_lock(); *wdev = __cfg80211_wdev_from_attrs(NULL, sock_net(cb->skb->sk), attrbuf); - kfree(attrbuf); + kfree(attrbuf_free); if (IS_ERR(*wdev)) { rtnl_unlock(); return PTR_ERR(*wdev); @@ -6197,7 +6201,7 @@ static int nl80211_dump_station(struct sk_buff *skb, int sta_idx = cb->args[2]; int err; - err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) return err; /* nl80211_prepare_wdev_dump acquired it in the successful case */ @@ -7092,7 +7096,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - err = nl80211_prepare_wdev_dump(cb, 
&rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) return err; /* nl80211_prepare_wdev_dump acquired it in the successful case */ @@ -7292,7 +7296,7 @@ static int nl80211_dump_mpp(struct sk_buff *skb, int path_idx = cb->args[2]; int err; - err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) return err; /* nl80211_prepare_wdev_dump acquired it in the successful case */ @@ -9718,7 +9722,7 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct netlink_callback *cb) int start = cb->args[2], idx = 0; int err; - err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); + err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) return err; /* nl80211_prepare_wdev_dump acquired it in the successful case */ @@ -9851,7 +9855,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct netlink_callback *cb) if (!attrbuf) return -ENOMEM; - res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev); + res = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, attrbuf); if (res) { kfree(attrbuf); return res; diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a3f387770f1b..d642e3be4ee7 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file - * Copyright (C) 2018, 2020 Intel Corporation + * Copyright (C) 2018, 2020-2021 Intel Corporation */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H @@ -22,10 +22,6 @@ static inline u64 wdev_id(struct wireless_dev *wdev) ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32); } -int nl80211_prepare_wdev_dump(struct netlink_callback *cb, - struct cfg80211_registered_device **rdev, - struct wireless_dev **wdev); - int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_chan_def *chandef); From bb162bb2b4394108c8f055d1b115735331205e28 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Mon, 8 Nov 2021 22:23:51 -0500 Subject: [PATCH 049/336] drm/sun4i: fix unmet dependency on RESET_CONTROLLER for PHY_SUN6I_MIPI_DPHY When PHY_SUN6I_MIPI_DPHY is selected, and RESET_CONTROLLER is not selected, Kbuild gives the following warning: WARNING: unmet direct dependencies detected for PHY_SUN6I_MIPI_DPHY Depends on [n]: (ARCH_SUNXI [=n] || COMPILE_TEST [=y]) && HAS_IOMEM [=y] && COMMON_CLK [=y] && RESET_CONTROLLER [=n] Selected by [y]: - DRM_SUN6I_DSI [=y] && HAS_IOMEM [=y] && DRM_SUN4I [=y] This is because DRM_SUN6I_DSI selects PHY_SUN6I_MIPI_DPHY without selecting or depending on RESET_CONTROLLER, despite PHY_SUN6I_MIPI_DPHY depending on RESET_CONTROLLER. These unmet dependency bugs were detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. v2: Fixed indentation to match the rest of the file. 
Signed-off-by: Julian Braha Acked-by: Jernej Skrabec Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20211109032351.43322-1-julianbraha@gmail.com --- drivers/gpu/drm/sun4i/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/sun4i/Kconfig b/drivers/gpu/drm/sun4i/Kconfig index 5755f0432e77..8c796de53222 100644 --- a/drivers/gpu/drm/sun4i/Kconfig +++ b/drivers/gpu/drm/sun4i/Kconfig @@ -46,6 +46,7 @@ config DRM_SUN6I_DSI default MACH_SUN8I select CRC_CCITT select DRM_MIPI_DSI + select RESET_CONTROLLER select PHY_SUN6I_MIPI_DPHY help Choose this option if you want have an Allwinner SoC with From 232d45277f0a6549d6ca7985bb152a60adac7b43 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 10 Nov 2021 20:02:13 +0800 Subject: [PATCH 050/336] doc/zh_CN: fix a translation error in management-style 'The name of the game' means the most important part of an activity, so we should translate it by the meaning instead of the words. Suggested-by: Xinyong Wang Signed-off-by: Alex Shi Reviewed-by: Yanteng Si Signed-off-by: Jonathan Corbet --- Documentation/translations/zh_CN/process/management-style.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/translations/zh_CN/process/management-style.rst b/Documentation/translations/zh_CN/process/management-style.rst index c6a5bb285797..8053ae474328 100644 --- a/Documentation/translations/zh_CN/process/management-style.rst +++ b/Documentation/translations/zh_CN/process/management-style.rst @@ -36,14 +36,14 @@ Linux内核管理风格 每个人都认为管理者做决定,而且决策很重要。决定越大越痛苦,管理者就必须越高级。 这很明显,但事实并非如此。 -游戏的名字是 **避免** 做出决定。尤其是,如果有人告诉你“选择(a)或(b), +最重要的是 **避免** 做出决定。尤其是,如果有人告诉你“选择(a)或(b), 我们真的需要你来做决定”,你就是陷入麻烦的管理者。你管理的人比你更了解细节, 所以如果他们来找你做技术决策,你完蛋了。你显然没有能力为他们做这个决定。 (推论:如果你管理的人不比你更了解细节,你也会被搞砸,尽管原因完全不同。 也就是说,你的工作是错的,他们应该管理你的才智) -所以游戏的名字是 **避免** 做出决定,至少是那些大而痛苦的决定。做一些小的 +所以最重要的是 **避免** 做出决定,至少是那些大而痛苦的决定。做一些小的 和非结果性的决定是很好的,并且使您看起来好像知道自己在做什么,所以内核管理者 需要做的是将那些大的和痛苦的决定变成那些没有人真正关心的小事情。 From 77dfc2bc0bb4b8376ecd7a430f27a4a8fff6a5a0 Mon Sep 17 00:00:00 2001 From: Xing Song Date: Mon, 1 Nov 2021 10:46:57 +0800 Subject: [PATCH 051/336] mac80211: do not access the IV when it was stripped ieee80211_get_keyid() will return false value if IV has been stripped, such as return 0 for IP/ARP frames due to LLC header, and return -EINVAL for disassociation frames due to its length... etc. Don't try to access it if it's not present. 
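The key index is carried in the TKIP/CCMP IV bytes that immediately follow the 802.11 header, so once the hardware has stripped the IV those bytes belong to the payload (for IP/ARP frames, the LLC/SNAP header) and the parsed index is whatever those payload bytes happen to contain. Condensed from the hunk that follows, the rule the fix enforces is: only parse the key index when the IV is actually present, otherwise keep the station's current pairwise key index. A rough sketch using the names from the diff (rx path reduced to the relevant check):

    struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
    int keyid = rx->sta->ptk_idx;   /* default: current PTK index */

    if (ieee80211_has_protected(fc) &&
        !(status->flag & RX_FLAG_IV_STRIPPED)) {
        /* IV bytes are present, so the key index in the frame is valid */
        keyid = ieee80211_get_keyid(rx->skb, rx->sta->cipher_scheme);
    }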
Signed-off-by: Xing Song Link: https://lore.kernel.org/r/20211101024657.143026-1-xing.song@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index fc5c608d02e2..6360c664259f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1952,7 +1952,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) int keyid = rx->sta->ptk_idx; sta_ptk = rcu_dereference(rx->sta->ptk[keyid]); - if (ieee80211_has_protected(fc)) { + if (ieee80211_has_protected(fc) && + !(status->flag & RX_FLAG_IV_STRIPPED)) { cs = rx->sta->cipher_scheme; keyid = ieee80211_get_keyid(rx->skb, cs); From 53b606fa29e321352a105978726b975b42b292a4 Mon Sep 17 00:00:00 2001 From: Wasin Thonkaew Date: Wed, 3 Nov 2021 19:35:04 +0000 Subject: [PATCH 052/336] docs: filesystems: Fix grammatical error "with" to "which" Signed-off-by: Wasin Thonkaew Signed-off-by: Jonathan Corbet --- Documentation/filesystems/autofs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/autofs.rst b/Documentation/filesystems/autofs.rst index 681c6a492bc0..4f490278d22f 100644 --- a/Documentation/filesystems/autofs.rst +++ b/Documentation/filesystems/autofs.rst @@ -35,7 +35,7 @@ This document describes only the kernel module and the interactions required with any user-space program. Subsequent text refers to this as the "automount daemon" or simply "the daemon". -"autofs" is a Linux kernel module with provides the "autofs" +"autofs" is a Linux kernel module which provides the "autofs" filesystem type. Several "autofs" filesystems can be mounted and they can each be managed separately, or all managed by the same daemon. From c033a38a81bc539d6c0db8c5387e0b14d819a0cf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Nov 2021 10:02:04 +0100 Subject: [PATCH 053/336] mac80211: fix radiotap header generation In commit 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header bitmap") we accidentally pointed the position to the wrong place, so we overwrite a present bitmap, and thus cause all kinds of trouble. To see the issue, note that the previous code read: pos = (void *)(it_present + 1); The requirement now is that we need to calculate pos via it_optional, to not trigger the compiler hardening checks, as: pos = (void *)&rthdr->it_optional[...]; Rewriting the original expression, we get (obviously, since that just adds "+ x - x" terms): pos = (void *)(it_present + 1 + rthdr->it_optional - rthdr->it_optional) and moving the "+ rthdr->it_optional" outside to be used as an array: pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional]; The original is off by one, fix it. Cc: stable@vger.kernel.org Fixes: 8c89f7b3d3f2 ("mac80211: Use flex-array for radiotap header bitmap") Reported-by: Sid Hayn Signed-off-by: Johannes Berg Tested-by: Sid Hayn Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20211109100203.c61007433ed6.I1dade57aba7de9c4f48d68249adbae62636fd98c@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6360c664259f..1b0bd642dc23 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -364,7 +364,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, * the compiler to think we have walked past the end of the * struct member. 
*/ - pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional]; + pos = (void *)&rthdr->it_optional[it_present + 1 - rthdr->it_optional]; /* the order of the following fields is important */ From f6ab25d41b18f3d26883cb9c20875e1a85c4f05b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 10 Nov 2021 22:22:01 +0100 Subject: [PATCH 054/336] mac80211: drop check for DONT_REORDER in __ieee80211_select_queue When __ieee80211_select_queue is called, skb->cb has not been cleared yet, which means that info->control.flags can contain garbage. In some cases this leads to IEEE80211_TX_CTRL_DONT_REORDER being set, causing packets marked for other queues to randomly end up in BE instead. This flag only needs to be checked in ieee80211_select_queue_80211, since the radiotap parser is the only piece of code that sets it Fixes: 66d06c84730c ("mac80211: adhere to Tx control flag that prevents frame reordering") Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20211110212201.35452-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/wme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 9ea6004abe1b..62c6733e0792 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -143,7 +143,6 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct mac80211_qos_map *qos_map; bool qos; @@ -156,7 +155,7 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, else qos = false; - if (!qos || (info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { + if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } From 6dd2360334f3cb3b45fc1b8194c670090474b87c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Nov 2021 13:51:44 +0100 Subject: [PATCH 055/336] mac80211: fix monitor_sdata RCU/locking assertions Since commit a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") we've not only been protecting the pointer to monitor_sdata with the RTNL, but also with the wiphy->mtx. This is relevant in a number of lockdep assertions, e.g. the one we hit in ieee80211_set_monitor_channel(). However, we're now protecting all the assignments/dereferences, even the one in interface iter, with the wiphy->mtx, so switch over the lockdep assertions to that lock. 
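Concretely, the contract the updated assertions encode is that local->monitor_sdata is published and cleared only under wiphy->mtx, and that sleepable readers must hold that mutex rather than (only) the RTNL. A rough sketch of the access patterns, using the helpers that appear in the diff (error handling omitted):

    /* writer: publish or clear the pointer only with the wiphy mutex held */
    wiphy_lock(local->hw.wiphy);
    rcu_assign_pointer(local->monitor_sdata, sdata);
    wiphy_unlock(local->hw.wiphy);

    /* sleepable reader: protection now comes from wiphy->mtx */
    lockdep_assert_wiphy(local->hw.wiphy);
    sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);

    /* RCU read side (e.g. the interface iterator) stays lock-free; the
     * iterator in the diff expresses this with rcu_dereference_check()
     * and a lockdep condition accepting either iflist_mtx or wiphy->mtx */
    rcu_read_lock();
    sdata = rcu_dereference(local->monitor_sdata);
    rcu_read_unlock();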
Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20211112135143.cb8e8ceffef3.Iaa210f16f6904c8a7a24954fb3396da0ef86ec08@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 12 ++++++++---- net/mac80211/iface.c | 4 +++- net/mac80211/util.c | 7 ++++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e2b791c37591..bd3d3195097f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -80,7 +80,8 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, } /* also validate MU-MIMO change */ - monitor_sdata = rtnl_dereference(local->monitor_sdata); + monitor_sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) @@ -840,7 +841,8 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, mutex_lock(&local->mtx); if (local->use_chanctx) { - sdata = rtnl_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (sdata) { ieee80211_vif_release_channel(sdata); ret = ieee80211_vif_use_channel(sdata, chandef, @@ -2707,7 +2709,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { - sdata = rtnl_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (!sdata) return -EOPNOTSUPP; } @@ -2767,7 +2770,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, mutex_unlock(&local->iflist_mtx); if (has_monitor) { - sdata = rtnl_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (sdata) { sdata->user_power_level = local->user_power_level; if (txp_type != sdata->vif.bss_conf.txpower_type) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9a2145c8192b..20aa5cc31f77 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -588,7 +588,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do */ if (local->suspended) { WARN_ON(local->wowlan); - WARN_ON(rtnl_dereference(local->monitor_sdata)); + WARN_ON(rcu_access_pointer(local->monitor_sdata)); return; } @@ -961,6 +961,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return 0; ASSERT_RTNL(); + lockdep_assert_wiphy(local->hw.wiphy); if (local->monitor_sdata) return 0; @@ -1028,6 +1029,7 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) return; ASSERT_RTNL(); + lockdep_assert_wiphy(local->hw.wiphy); mutex_lock(&local->iflist_mtx); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 39fa2a50385d..43df2f0c5db9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -796,7 +796,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, sdata = rcu_dereference_check(local->monitor_sdata, lockdep_is_held(&local->iflist_mtx) || - lockdep_rtnl_is_held()); + lockdep_is_held(&local->hw.wiphy->mtx)); if (sdata && (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || sdata->flags & IEEE80211_SDATA_IN_DRIVER)) @@ -2381,7 +2381,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) IEEE80211_TPT_LEDTRIG_FL_RADIO, 0); /* add interfaces */ - sdata = rtnl_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); if (sdata) { /* in HW restart it exists already */ 
WARN_ON(local->resuming); @@ -2426,7 +2426,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) WARN_ON(drv_add_chanctx(local, ctx)); mutex_unlock(&local->chanctx_mtx); - sdata = rtnl_dereference(local->monitor_sdata); + sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (sdata && ieee80211_sdata_running(sdata)) ieee80211_assign_chanctx(local, sdata); } From 30f6cf96912b638d0ddfc325204b598f94efddc2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 13 Nov 2021 07:34:15 +0100 Subject: [PATCH 056/336] mac80211: fix throughput LED trigger The codepaths for rx with decap offload and tx with itxq were not updating the counters for the throughput led trigger. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20211113063415.55147-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/led.h | 8 ++++---- net/mac80211/rx.c | 7 ++++--- net/mac80211/tx.c | 34 +++++++++++++++------------------- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/net/mac80211/led.h b/net/mac80211/led.h index fb3aaa3c5606..b71a1428d883 100644 --- a/net/mac80211/led.h +++ b/net/mac80211/led.h @@ -72,19 +72,19 @@ static inline void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, #endif static inline void -ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, __le16 fc, int bytes) +ieee80211_tpt_led_trig_tx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) + if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->tx_bytes += bytes; #endif } static inline void -ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, __le16 fc, int bytes) +ieee80211_tpt_led_trig_rx(struct ieee80211_local *local, int bytes) { #ifdef CONFIG_MAC80211_LEDS - if (ieee80211_is_data(fc) && atomic_read(&local->tpt_led_active)) + if (atomic_read(&local->tpt_led_active)) local->tpt_led_trigger->rx_bytes += bytes; #endif } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1b0bd642dc23..9541a4c30aca 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4864,6 +4864,7 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_rate *rate = NULL; struct ieee80211_supported_band *sband; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; WARN_ON_ONCE(softirq_count() == 0); @@ -4960,9 +4961,9 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, if (!(status->flag & RX_FLAG_8023)) skb = ieee80211_rx_monitor(local, skb, rate); if (skb) { - ieee80211_tpt_led_trig_rx(local, - ((struct ieee80211_hdr *)skb->data)->frame_control, - skb->len); + if ((status->flag & RX_FLAG_8023) || + ieee80211_is_data_present(hdr->frame_control)) + ieee80211_tpt_led_trig_rx(local, skb->len); if (status->flag & RX_FLAG_8023) __ieee80211_rx_handle_8023(hw, pubsta, skb, list); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a756a197c770..278945e3e08a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1721,21 +1721,19 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, * Returns false if the frame couldn't be transmitted but was queued instead. 
*/ static bool __ieee80211_tx(struct ieee80211_local *local, - struct sk_buff_head *skbs, int led_len, - struct sta_info *sta, bool txpending) + struct sk_buff_head *skbs, struct sta_info *sta, + bool txpending) { struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; struct sk_buff *skb; bool result; - __le16 fc; if (WARN_ON(skb_queue_empty(skbs))) return true; skb = skb_peek(skbs); - fc = ((struct ieee80211_hdr *)skb->data)->frame_control; info = IEEE80211_SKB_CB(skb); sdata = vif_to_sdata(info->control.vif); if (sta && !sta->uploaded) @@ -1769,8 +1767,6 @@ static bool __ieee80211_tx(struct ieee80211_local *local, result = ieee80211_tx_frags(local, vif, sta, skbs, txpending); - ieee80211_tpt_led_trig_tx(local, fc, led_len); - WARN_ON_ONCE(!skb_queue_empty(skbs)); return result; @@ -1920,7 +1916,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, ieee80211_tx_result res_prepare; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); bool result = true; - int led_len; if (unlikely(skb->len < 10)) { dev_kfree_skb(skb); @@ -1928,7 +1923,6 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, } /* initialises tx */ - led_len = skb->len; res_prepare = ieee80211_tx_prepare(sdata, &tx, sta, skb); if (unlikely(res_prepare == TX_DROP)) { @@ -1951,8 +1945,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, return true; if (!invoke_tx_handlers_late(&tx)) - result = __ieee80211_tx(local, &tx.skbs, led_len, - tx.sta, txpending); + result = __ieee80211_tx(local, &tx.skbs, tx.sta, txpending); return result; } @@ -4175,6 +4168,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct ieee80211_local *local = sdata->local; struct sta_info *sta; struct sk_buff *next; + int len = skb->len; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4221,10 +4215,8 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, } } else { /* we cannot process non-linear frames on this path */ - if (skb_linearize(skb)) { - kfree_skb(skb); - goto out; - } + if (skb_linearize(skb)) + goto out_free; /* the frame could be fragmented, software-encrypted, and other * things so we cannot really handle checksum offload with it - @@ -4258,7 +4250,10 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; out_free: kfree_skb(skb); + len = 0; out: + if (len) + ieee80211_tpt_led_trig_tx(local, len); rcu_read_unlock(); } @@ -4396,8 +4391,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, } static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, int led_len, - struct sta_info *sta, + struct sk_buff *skb, struct sta_info *sta, bool txpending) { struct ieee80211_local *local = sdata->local; @@ -4410,6 +4404,8 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, if (sta) sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift); + ieee80211_tpt_led_trig_tx(local, skb->len); + if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; @@ -4498,7 +4494,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (key) info->control.hw_key = &key->conf; - ieee80211_tx_8023(sdata, skb, skb->len, sta, false); + ieee80211_tx_8023(sdata, skb, sta, false); return; @@ -4637,7 +4633,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, if (IS_ERR(sta) || (sta && !sta->uploaded)) sta = NULL; - result = ieee80211_tx_8023(sdata, skb, skb->len, sta, true); + result = ieee80211_tx_8023(sdata, skb, sta, true); } else { struct sk_buff_head 
skbs; @@ -4647,7 +4643,7 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local, hdr = (struct ieee80211_hdr *)skb->data; sta = sta_info_get(sdata, hdr->addr1); - result = __ieee80211_tx(local, &skbs, skb->len, sta, true); + result = __ieee80211_tx(local, &skbs, sta, true); } return result; From 8d48bf8206f77aa8687f0e241e901e5197e52423 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 5 Nov 2021 10:41:51 +0100 Subject: [PATCH 057/336] x86/boot: Pull up cmdline preparation and early param parsing Dan reports that Anjaneya Chagam can no longer use the efi=nosoftreserve kernel command line parameter to suppress "soft reservation" behavior. This is due to the fact that the following call-chain happens at boot: early_reserve_memory |-> efi_memblock_x86_reserve_range |-> efi_fake_memmap_early which does if (!efi_soft_reserve_enabled()) return; and that would have set EFI_MEM_NO_SOFT_RESERVE after having parsed "nosoftreserve". However, parse_early_param() gets called *after* it, leading to the boot cmdline not being taken into account. Therefore, carve out the command line preparation into a separate function which does the early param parsing too. So that it all goes together. And then call that function before early_reserve_memory() so that the params would have been parsed by then. Fixes: 8aa83e6395ce ("x86/setup: Call early_reserve_memory() earlier") Reported-by: Dan Williams Reviewed-by: Dan Williams Signed-off-by: Borislav Petkov Tested-by: Anjaneya Chagam Cc: Link: https://lore.kernel.org/r/e8dd8993c38702ee6dd73b3c11f158617e665607.camel@intel.com --- arch/x86/kernel/setup.c | 66 ++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 49b596db5631..c410be738ae7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -742,6 +742,28 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +static char *prepare_command_line(void) +{ +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + + parse_early_param(); + + return command_line; +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -830,6 +852,23 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * x86_configure_nx() is called before parse_early_param() (called by + * prepare_command_line()) to detect whether hardware doesn't support + * NX (so that the early EHCI debug console setup can safely call + * set_fixmap()). It may then be called again from within noexec_setup() + * during parsing early parameters to honor the respective command line + * option. + */ + x86_configure_nx(); + + /* + * This parses early params and it needs to run before + * early_reserve_memory() because latter relies on such settings + * supplied as early params. + */ + *cmdline_p = prepare_command_line(); + /* * Do some memory reservations *before* memory is added to memblock, so * memblock allocations won't overwrite it. 
@@ -863,33 +902,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - /* - * x86_configure_nx() is called before parse_early_param() to detect - * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. - */ - x86_configure_nx(); - - parse_early_param(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux From 8a7eb2d476c6823cd44d8c25a6230a52417d7ef8 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 1 Nov 2021 23:00:26 +0800 Subject: [PATCH 058/336] Drivers: hv: balloon: Use VMBUS_RING_SIZE() wrapper for dm_ring_size Baihua reported an error when boot an ARM64 guest with PAGE_SIZE=64k and BALLOON is enabled: hv_vmbus: registering driver hv_balloon hv_vmbus: probe failed for device 1eccfd72-4b41-45ef-b73a-4a6e44c12924 (-22) The cause of this is that the ringbuffer size for hv_balloon is not adjusted with VMBUS_RING_SIZE(), which makes the size not large enough for ringbuffers on guest with PAGE_SIZE=64k. Therefore use VMBUS_RING_SIZE() to calculate the ringbuffer size. Note that the old size (20 * 1024) counts a 4k header in the total size, while VMBUS_RING_SIZE() expects the parameter as the payload size, so use 16 * 1024. Cc: # 5.15.x Reported-by: Baihua Lu Signed-off-by: Boqun Feng Tested-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20211101150026.736124-1-boqun.feng@gmail.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 7f11ea07d698..ca873a3b98db 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -480,7 +480,7 @@ module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); -static int dm_ring_size = 20 * 1024; +static int dm_ring_size = VMBUS_RING_SIZE(16 * 1024); /* * Driver specific state. From daf972118c517b91f74ff1731417feb4270625a4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 18:22:38 +0000 Subject: [PATCH 059/336] x86/hyperv: Fix NULL deref in set_hv_tscchange_cb() if Hyper-V setup fails Check for a valid hv_vp_index array prior to derefencing hv_vp_index when setting Hyper-V's TSC change callback. If Hyper-V setup failed in hyperv_init(), the kernel will still report that it's running under Hyper-V, but will have silently disabled nearly all functionality. 
BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 4 PID: 1 Comm: swapper/0 Not tainted 5.15.0-rc2+ #75 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:set_hv_tscchange_cb+0x15/0xa0 Code: <8b> 04 82 8b 15 12 17 85 01 48 c1 e0 20 48 0d ee 00 01 00 f6 c6 08 ... Call Trace: kvm_arch_init+0x17c/0x280 kvm_init+0x31/0x330 vmx_init+0xba/0x13a do_one_initcall+0x41/0x1c0 kernel_init_freeable+0x1f2/0x23b kernel_init+0x16/0x120 ret_from_fork+0x22/0x30 Fixes: 93286261de1b ("x86/hyperv: Reenlightenment notifications support") Cc: stable@vger.kernel.org Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20211104182239.1302956-2-seanjc@google.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 24f4a06ac46a..7d252a58fbe4 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -177,6 +177,9 @@ void set_hv_tscchange_cb(void (*cb)(void)) return; } + if (!hv_vp_index) + return; + hv_reenlightenment_cb = cb; /* Make sure callback is registered before we write to MSRs */ From f3e613e72f66226b3bea1046c1b864f67a3000a4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 18:22:39 +0000 Subject: [PATCH 060/336] x86/hyperv: Move required MSRs check to initial platform probing Explicitly check for MSR_HYPERCALL and MSR_VP_INDEX support when probing for running as a Hyper-V guest instead of waiting until hyperv_init() to detect the bogus configuration. Add messages to give the admin a heads up that they are likely running on a broken virtual machine setup. At best, silently disabling Hyper-V is confusing and difficult to debug, e.g. the kernel _says_ it's using all these fancy Hyper-V features, but always falls back to the native versions. At worst, the half baked setup will crash/hang the kernel. 
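Condensed from the two hunks below: the mandatory-MSR check moves into the platform probe, so a broken setup is rejected with a warning before the kernel ever identifies itself as running on Hyper-V, and hyperv_init() then returns at its very first check instead of running half initialized. A rough sketch of the resulting flow (signature and CPUID-range checks omitted):

    static uint32_t __init ms_hyperv_platform(void)
    {
        u32 eax = cpuid_eax(HYPERV_CPUID_FEATURES);

        if (!(eax & HV_MSR_HYPERCALL_AVAILABLE) ||
            !(eax & HV_MSR_VP_INDEX_AVAILABLE))
            return 0;   /* the real hunk pr_warn()s which MSR is missing */

        return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
    }

    void __init hyperv_init(void)
    {
        /* on such a setup this returns immediately, because the probe
         * above returned 0 and x86_hyper_type was never set to
         * X86_HYPER_MS_HYPERV */
        if (x86_hyper_type != X86_HYPER_MS_HYPERV)
            return;
        /* ... */
    }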
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Link: https://lore.kernel.org/r/20211104182239.1302956-3-seanjc@google.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 9 +-------- arch/x86/kernel/cpu/mshyperv.c | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 7d252a58fbe4..96eb7db31c8e 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -386,20 +386,13 @@ static void __init hv_get_partition_id(void) */ void __init hyperv_init(void) { - u64 guest_id, required_msrs; + u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; int cpuhp; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; - /* Absolutely required MSRs */ - required_msrs = HV_MSR_HYPERCALL_AVAILABLE | - HV_MSR_VP_INDEX_AVAILABLE; - - if ((ms_hyperv.features & required_msrs) != required_msrs) - return; - if (hv_common_init()) return; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4794b716ec79..ff55df60228f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -163,12 +163,22 @@ static uint32_t __init ms_hyperv_platform(void) cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - if (eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12)) - return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || + memcmp("Microsoft Hv", hyp_signature, 12)) + return 0; - return 0; + /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ + eax = cpuid_eax(HYPERV_CPUID_FEATURES); + if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { + pr_warn("x86/hyperv: HYPERCALL MSR not available.\n"); + return 0; + } + if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { + pr_warn("x86/hyperv: VP_INDEX MSR not available.\n"); + return 0; + } + + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; } static unsigned char hv_get_nmi_reason(void) From 86c3a3e964d910a62eeb277d60b2a60ebefa9feb Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 11 Nov 2021 12:59:16 -0800 Subject: [PATCH 061/336] tipc: use consistent GFP flags Some functions, like tipc_crypto_start use inconsisten GFP flags when allocating memory. The mentioned function use GFP_ATOMIC to to alloc a crypto instance, and then calls alloc_ordered_workqueue() which allocates memory with GFP_KERNEL. tipc_aead_init() function even uses GFP_KERNEL and GFP_ATOMIC interchangeably. No doc comment specifies what context a function is designed to work in, but the flags should at least be consistent within a function. Cc: Jon Maloy Cc: Ying Xue Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: tipc-discussion@lists.sourceforge.net Cc: linux-kernel@vger.kernel.org Signed-off-by: Tadeusz Struk Signed-off-by: David S. 
Miller --- net/tipc/crypto.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index dc60c32bb70d..e701651f6533 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -524,7 +524,7 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, return -EEXIST; /* Allocate a new AEAD */ - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; @@ -1470,7 +1470,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, return -EEXIST; /* Allocate crypto */ - c = kzalloc(sizeof(*c), GFP_ATOMIC); + c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; @@ -1484,7 +1484,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, } /* Allocate statistic structure */ - c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); + c->stats = alloc_percpu(struct tipc_crypto_stats); if (!c->stats) { if (c->wq) destroy_workqueue(c->wq); @@ -2457,7 +2457,7 @@ static void tipc_crypto_work_tx(struct work_struct *work) } /* Lets duplicate it first */ - skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); + skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_KERNEL); rcu_read_unlock(); /* Now, generate new key, initiate & distribute it */ From a31d27fbed5d518734cb60956303eb15089a7634 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 12 Nov 2021 08:56:03 +0100 Subject: [PATCH 062/336] tun: fix bonding active backup with arp monitoring As stated in the bonding doc, trans_start must be set manually for drivers using NETIF_F_LLTX: Drivers that use NETIF_F_LLTX flag must also update netdev_queue->trans_start. If they do not, then the ARP monitor will immediately fail any slaves using that driver, and those slaves will stay down. Link: https://www.kernel.org/doc/html/v5.15/networking/bonding.html#arp-monitor-operation Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fecc9a1d293a..1572878c3403 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1010,6 +1010,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; + struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; @@ -1054,6 +1055,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; + /* NETIF_F_LLTX requires to do our own update of trans_start */ + queue = netdev_get_tx_queue(dev, txq); + queue->trans_start = jiffies; + /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); From 10a2308ffb8cf262e473eb324fde42ae31b6da04 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 12 Nov 2021 18:16:34 +0800 Subject: [PATCH 063/336] net: Clean up some inconsistent indenting Eliminate the follow smatch warning: ./include/linux/skbuff.h:4229 skb_remcsum_process() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 686a666d073d..c8cb7e697d47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4226,7 +4226,7 @@ static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, return; } - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { __skb_checksum_complete(skb); skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); } From f7715b3a349900e3741d9029d02cd1e91b4fc588 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 8 Nov 2021 12:03:52 +0800 Subject: [PATCH 064/336] gpio: virtio: remove unneeded semicolon Eliminate the following coccicheck warning: ./drivers/gpio/gpio-virtio.c:437:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Viresh Kumar Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c index aeec4bf0b625..84f96b78f32a 100644 --- a/drivers/gpio/gpio-virtio.c +++ b/drivers/gpio/gpio-virtio.c @@ -434,7 +434,7 @@ static void virtio_gpio_event_vq(struct virtqueue *vq) ret = generic_handle_domain_irq(vgpio->gc.irq.domain, gpio); if (ret) dev_err(dev, "failed to handle interrupt: %d\n", ret); - }; + } } static void virtio_gpio_request_vq(struct virtqueue *vq) From 6e228d8cbb1cc6ba78022d406340e901e08d26e0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 12 Nov 2021 16:22:09 -0600 Subject: [PATCH 065/336] net: ipa: HOLB register sometimes must be written twice Starting with IPA v4.5, the HOL_BLOCK_EN register must be written twice when enabling head-of-line blocking avoidance. Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints") Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 5528d97110d5..006da4642a0b 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -868,6 +868,9 @@ ipa_endpoint_init_hol_block_enable(struct ipa_endpoint *endpoint, bool enable) val = enable ? HOL_BLOCK_EN_FMASK : 0; offset = IPA_REG_ENDP_INIT_HOL_BLOCK_EN_N_OFFSET(endpoint_id); iowrite32(val, endpoint->ipa->reg_virt + offset); + /* When enabling, the register must be written twice for IPA v4.5+ */ + if (enable && endpoint->ipa->version >= IPA_VERSION_4_5) + iowrite32(val, endpoint->ipa->reg_virt + offset); } void ipa_endpoint_modem_hol_block_clear_all(struct ipa *ipa) From 816316cacad2b5abd5b41423cf04e4845239abd4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 12 Nov 2021 16:22:10 -0600 Subject: [PATCH 066/336] net: ipa: disable HOLB drop when updating timer The head-of-line blocking timer should only be modified when head-of-line drop is disabled. One of the steps in recovering from a modem crash is to enable dropping of packets with timeout of 0 (immediate). We don't know how the modem configured its endpoints, so before we program the timer, we need to ensure HOL_BLOCK is disabled. Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints") Signed-off-by: Alex Elder Signed-off-by: David S. 
Miller --- drivers/net/ipa/ipa_endpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 006da4642a0b..ef790fd0ab56 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -853,6 +853,7 @@ static void ipa_endpoint_init_hol_block_timer(struct ipa_endpoint *endpoint, u32 offset; u32 val; + /* This should only be changed when HOL_BLOCK_EN is disabled */ offset = IPA_REG_ENDP_INIT_HOL_BLOCK_TIMER_N_OFFSET(endpoint_id); val = hol_block_timer_val(ipa, microseconds); iowrite32(val, ipa->reg_virt + offset); @@ -883,6 +884,7 @@ void ipa_endpoint_modem_hol_block_clear_all(struct ipa *ipa) if (endpoint->toward_ipa || endpoint->ee_id != GSI_EE_MODEM) continue; + ipa_endpoint_init_hol_block_enable(endpoint, false); ipa_endpoint_init_hol_block_timer(endpoint, 0); ipa_endpoint_init_hol_block_enable(endpoint, true); } From 2153bd1e3d3dbf6a3403572084ef6ed31c53c5f0 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Sat, 13 Nov 2021 15:33:35 +0800 Subject: [PATCH 067/336] net/smc: Transfer remaining wait queue entries during fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SMC fallback is incomplete currently. There may be some wait queue entries remaining in smc socket->wq, which should be removed to clcsocket->wq during the fallback. For example, in nginx/wrk benchmark, this issue causes an all-zeros test result: server: nginx -g 'daemon off;' client: smc_run wrk -c 1 -t 1 -d 5 http://11.200.15.93/index.html Running 5s test @ http://11.200.15.93/index.html 1 threads and 1 connections Thread Stats Avg Stdev Max ± Stdev Latency 0.00us 0.00us 0.00us -nan% Req/Sec 0.00 0.00 0.00 -nan% 0 requests in 5.00s, 0.00B read Requests/sec: 0.00 Transfer/sec: 0.00B The reason for this all-zeros result is that when wrk used SMC to replace TCP, it added an eppoll_entry into smc socket->wq and expected to be notified if epoll events like EPOLL_IN/ EPOLL_OUT occurred on the smc socket. However, once a fallback occurred, wrk switches to use clcsocket. Now it is clcsocket->wq instead of smc socket->wq which will be woken up. The eppoll_entry remaining in smc socket->wq does not work anymore and wrk stops the test. This patch fixes this issue by removing remaining wait queue entries from smc socket->wq to clcsocket->wq during the fallback. Link: https://www.spinics.net/lists/netdev/msg779769.html Signed-off-by: Wen Gu Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 59284da9116d..b61c802e3bf3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -566,6 +566,10 @@ static void smc_stat_fallback(struct smc_sock *smc) static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { + wait_queue_head_t *smc_wait = sk_sleep(&smc->sk); + wait_queue_head_t *clc_wait = sk_sleep(smc->clcsock->sk); + unsigned long flags; + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -575,6 +579,16 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + + /* There may be some entries remaining in + * smc socket->wq, which should be removed + * to clcsocket->wq during the fallback. 
+ */ + spin_lock_irqsave(&smc_wait->lock, flags); + spin_lock(&clc_wait->lock); + list_splice_init(&smc_wait->head, &clc_wait->head); + spin_unlock(&clc_wait->lock); + spin_unlock_irqrestore(&smc_wait->lock, flags); } } From 92a59d7f381d2caf69385bfa00590028e32eea26 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 8 Nov 2021 09:28:49 +0800 Subject: [PATCH 068/336] selftests: gpio: fix gpio compiling error The gpio selftests build against the system includes rather than the headers from the linux tree. This results in the compile failing if the system includes are outdated. Prefer the headers from the linux tree, as per other selftests. Fixes: 8bc395a6a2e2 ("selftests: gpio: rework and simplify test implementation") Reported-by: kernel test robot Signed-off-by: Li Zhijian [Kent: reworded commit comment and added Fixes:] Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- tools/testing/selftests/gpio/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 39f2bbe8dd3d..42ea7d2aa844 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -3,5 +3,6 @@ TEST_PROGS := gpio-mockup.sh TEST_FILES := gpio-mockup-sysfs.sh TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev +CFLAGS += -I../../../../usr/include include ../lib.mk From c472d71be0be0f0ca0fbb794dce83bd76623ee2f Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 8 Nov 2021 09:28:50 +0800 Subject: [PATCH 069/336] selftests: gpio: fix uninitialised variable warning When compiled with -Wall gpio-mockup-cdev.c reports an uninitialised variable warning. This is a false positive, as the variable is ignored in the case it is uninitialised, but initialise the variable anyway to remove the warning. Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- tools/testing/selftests/gpio/gpio-mockup-cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/gpio/gpio-mockup-cdev.c b/tools/testing/selftests/gpio/gpio-mockup-cdev.c index e83eac71621a..d1640f44f8ac 100644 --- a/tools/testing/selftests/gpio/gpio-mockup-cdev.c +++ b/tools/testing/selftests/gpio/gpio-mockup-cdev.c @@ -117,7 +117,7 @@ int main(int argc, char *argv[]) { char *chip; int opt, ret, cfd, lfd; - unsigned int offset, val, abiv; + unsigned int offset, val = 0, abiv; uint32_t flags_v1; uint64_t flags_v2; From 4f4d0af7b2d997635b08fabd748673eff1bb12d6 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 8 Nov 2021 09:28:51 +0800 Subject: [PATCH 070/336] selftests: gpio: restore CFLAGS options All the CFLAGS options were incorrectly removed in the recent rework of the GPIO selftests. While some of the flags were specific to the old implementation the remainder are still relevant. Restore those options. 
Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- tools/testing/selftests/gpio/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile index 42ea7d2aa844..d7b312b44a62 100644 --- a/tools/testing/selftests/gpio/Makefile +++ b/tools/testing/selftests/gpio/Makefile @@ -3,6 +3,6 @@ TEST_PROGS := gpio-mockup.sh TEST_FILES := gpio-mockup-sysfs.sh TEST_GEN_PROGS_EXTENDED := gpio-mockup-cdev -CFLAGS += -I../../../../usr/include +CFLAGS += -O2 -g -Wall -I../../../../usr/include/ include ../lib.mk From f8885ac89ce310570e5391fe0bf0ec9c7c9b4fdc Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Sun, 14 Nov 2021 01:36:36 +0300 Subject: [PATCH 071/336] net: bnx2x: fix variable dereferenced before check Smatch says: bnx2x_init_ops.h:640 bnx2x_ilt_client_mem_op() warn: variable dereferenced before check 'ilt' (see line 638) Move ilt_cli variable initialization _after_ ilt validation, because it's unsafe to deref the pointer before validation check. Fixes: 523224a3b3cd ("bnx2x, cnic, bnx2i: use new FW/HSI") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h index 1835d2e451c0..fc7fce642666 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init_ops.h @@ -635,11 +635,13 @@ static int bnx2x_ilt_client_mem_op(struct bnx2x *bp, int cli_num, { int i, rc; struct bnx2x_ilt *ilt = BP_ILT(bp); - struct ilt_client_info *ilt_cli = &ilt->clients[cli_num]; + struct ilt_client_info *ilt_cli; if (!ilt || !ilt->lines) return -1; + ilt_cli = &ilt->clients[cli_num]; + if (ilt_cli->flags & (ILT_CLIENT_SKIP_INIT | ILT_CLIENT_SKIP_MEM)) return 0; From b922f622592af76b57cbc566eaeccda0b31a3496 Mon Sep 17 00:00:00 2001 From: Zekun Shen Date: Sat, 13 Nov 2021 22:24:40 -0500 Subject: [PATCH 072/336] atlantic: Fix OOB read and write in hw_atl_utils_fw_rpc_wait This bug report shows up when running our research tools. The reports is SOOB read, but it seems SOOB write is also possible a few lines below. In details, fw.len and sw.len are inputs coming from io. A len over the size of self->rpc triggers SOOB. The patch fixes the bugs by adding sanity checks. The bugs are triggerable with compromised/malfunctioning devices. They are potentially exploitable given they first leak up to 0xffff bytes and able to overwrite the region later. The patch is tested with QEMU emulater. This is NOT tested with a real device. Attached is the log we found by fuzzing. BUG: KASAN: slab-out-of-bounds in hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] Read of size 4 at addr ffff888016260b08 by task modprobe/213 CPU: 0 PID: 213 Comm: modprobe Not tainted 5.6.0 #1 Call Trace: dump_stack+0x76/0xa0 print_address_description.constprop.0+0x16/0x200 ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] __kasan_report.cold+0x37/0x7c ? aq_hw_read_reg_bit+0x60/0x70 [atlantic] ? hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] kasan_report+0xe/0x20 hw_atl_utils_fw_upload_dwords+0x393/0x3c0 [atlantic] hw_atl_utils_fw_rpc_call+0x95/0x130 [atlantic] hw_atl_utils_fw_rpc_wait+0x176/0x210 [atlantic] hw_atl_utils_mpi_create+0x229/0x2e0 [atlantic] ? hw_atl_utils_fw_rpc_wait+0x210/0x210 [atlantic] ? 
hw_atl_utils_initfw+0x9f/0x1c8 [atlantic] hw_atl_utils_initfw+0x12a/0x1c8 [atlantic] aq_nic_ndev_register+0x88/0x650 [atlantic] ? aq_nic_ndev_init+0x235/0x3c0 [atlantic] aq_pci_probe+0x731/0x9b0 [atlantic] ? aq_pci_func_init+0xc0/0xc0 [atlantic] local_pci_probe+0xd3/0x160 pci_device_probe+0x23f/0x3e0 Reported-by: Brendan Dolan-Gavitt Signed-off-by: Zekun Shen Signed-off-by: David S. Miller --- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index fc0e66006644..3f1704cbe1cb 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -559,6 +559,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self, goto err_exit; if (fw.len == 0xFFFFU) { + if (sw.len > sizeof(self->rpc)) { + printk(KERN_INFO "Invalid sw len: %x\n", sw.len); + err = -EINVAL; + goto err_exit; + } err = hw_atl_utils_fw_rpc_call(self, sw.len); if (err < 0) goto err_exit; @@ -567,6 +572,11 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self, if (rpc) { if (fw.len) { + if (fw.len > sizeof(self->rpc)) { + printk(KERN_INFO "Invalid fw len: %x\n", fw.len); + err = -EINVAL; + goto err_exit; + } err = hw_atl_utils_fw_downld_dwords(self, self->rpc_addr, From a0ddee65c527d877e798205c1391c6170e580c66 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Nov 2021 16:07:49 +0200 Subject: [PATCH 073/336] printk: Remove printk.h inclusion in percpu.h After the commit 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") the printk.h is not needed anymore in percpu.h. Moreover `make headerdep` complains (an excerpt) In file included from linux/printk.h, from linux/dynamic_debug.h:188 from linux/printk.h:559 <-- here from linux/percpu.h:9 from linux/idr.h:17 include/net/9p/client.h:13: warning: recursive header inclusion Yeah, it's not a root cause of this, but removing will help to reduce the noise. 
Fixes: 42a0bb3f7138 ("printk/nmi: generic solution for safe printk in NMI") Signed-off-by: Andy Shevchenko Acked-by: Dennis Zhou Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211112140749.80042-1-andriy.shevchenko@linux.intel.com --- include/linux/percpu.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..4fa3000f9c22 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include From e97b21e94652f5f0d1c196452c111151f6d15883 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Nov 2021 17:02:29 -0800 Subject: [PATCH 074/336] net: ethernet: lantiq_etop: fix build errors/warnings Fix build error and warnings reported by kernel test robot: drivers/net/ethernet/lantiq_etop.c: In function 'ltq_etop_probe': drivers/net/ethernet/lantiq_etop.c:673:15: error: implicit declaration of function 'device_property_read_u32' [-Werror=implicit-function-declaration] 673 | err = device_property_read_u32(&pdev->dev, "lantiq,tx-burst-length", &priv->tx_burst_len); drivers/net/ethernet/lantiq_etop.c: At top level: drivers/net/ethernet/lantiq_etop.c:730:1: warning: no previous prototype for 'init_ltq_etop' [-Wmissing-prototypes] 730 | init_ltq_etop(void) drivers/net/ethernet/lantiq_etop.c: In function 'ltq_etop_hw_init': drivers/net/ethernet/lantiq_etop.c:276:25: warning: ignoring return value of 'request_irq' declared with attribute 'warn_unused_result' [-Wunused-result] 276 | request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv); drivers/net/ethernet/lantiq_etop.c:284:25: warning: ignoring return value of 'request_irq' declared with attribute 'warn_unused_result' [-Wunused-result] 284 | request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv); Fixes: 14d4e308e0aa ("net: lantiq: configure the burst length in ethernet drivers") Fixes: dddb29e42770 ("net: lantiq_etop: remove deprecated IRQF_DISABLED") Fixes: 504d4721ee8e ("MIPS: Lantiq: Add ethernet driver") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Link: lore.kernel.org/r/202111090621.yjr9xuVj-lkp@intel.com To: netdev@vger.kernel.org Cc: Aleksander Jan Bajkowski Cc: Hauke Mehrtens Cc: "David S. Miller" Cc: Jakub Kicinski Cc: John Crispin Cc: linux-mips@vger.kernel.org Cc: Ralf Baechle Cc: Michael Opdenacker Signed-off-by: David S. 
Miller --- drivers/net/ethernet/lantiq_etop.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 6433c909c6b2..072391c494ce 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -239,6 +240,7 @@ ltq_etop_hw_init(struct net_device *dev) { struct ltq_etop_priv *priv = netdev_priv(dev); int i; + int err; ltq_pmu_enable(PMU_PPE); @@ -273,7 +275,13 @@ ltq_etop_hw_init(struct net_device *dev) if (IS_TX(i)) { ltq_dma_alloc_tx(&ch->dma); - request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv); + err = request_irq(irq, ltq_etop_dma_irq, 0, "etop_tx", priv); + if (err) { + netdev_err(dev, + "Unable to get Tx DMA IRQ %d\n", + irq); + return err; + } } else if (IS_RX(i)) { ltq_dma_alloc_rx(&ch->dma); for (ch->dma.desc = 0; ch->dma.desc < LTQ_DESC_NUM; @@ -281,7 +289,13 @@ ltq_etop_hw_init(struct net_device *dev) if (ltq_etop_alloc_skb(ch)) return -ENOMEM; ch->dma.desc = 0; - request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv); + err = request_irq(irq, ltq_etop_dma_irq, 0, "etop_rx", priv); + if (err) { + netdev_err(dev, + "Unable to get Rx DMA IRQ %d\n", + irq); + return err; + } } ch->dma.irq = irq; } @@ -726,7 +740,7 @@ static struct platform_driver ltq_mii_driver = { }, }; -int __init +static int __init init_ltq_etop(void) { int ret = platform_driver_probe(<q_mii_driver, ltq_etop_probe); From 46d08f55d24e69e921456b5a40717da09199267b Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Mon, 15 Nov 2021 02:37:59 -0500 Subject: [PATCH 075/336] bnxt_en: extend RTNL to VF check in devlink driver_reinit The fixes the race condition between configuring SR-IOV and devlink reload. The SR-IOV configure logic already takes the RTNL lock, setting sriov_cfg under the lock while changes are underway. Extend the lock scope in devlink driver_reinit to cover the VF check and don't run concurrently with SR-IOV configure. Reported-by: Leon Romanovsky Fixes: 228ea8c187d8 ("bnxt_en: implement devlink dev reload driver_reinit") Cc: Leon Romanovsky Reviewed-by: Somnath Kotur Reviewed-by: Pavan Chebbi Reviewed-by: Andy Gospodarek Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 5c464ea73576..a0a9af402642 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -441,12 +441,13 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change, switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: { - if (BNXT_PF(bp) && bp->pf.active_vfs) { + rtnl_lock(); + if (BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg)) { NL_SET_ERR_MSG_MOD(extack, - "reload is unsupported when VFs are allocated"); + "reload is unsupported while VFs are allocated or being configured"); + rtnl_unlock(); return -EOPNOTSUPP; } - rtnl_lock(); if (bp->dev->reg_state == NETREG_UNREGISTERED) { rtnl_unlock(); return -ENODEV; From b68a1a933fe4a52a8316d214e3421f2a89bc113e Mon Sep 17 00:00:00 2001 From: Edwin Peer Date: Mon, 15 Nov 2021 02:38:00 -0500 Subject: [PATCH 076/336] bnxt_en: fix format specifier in live patch error message This fixes type mismatch warning. 
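For context on the one-character change above: GCC's format checking (part of -Wall) warns whenever a conversion specifier expects a different type than the argument supplied, which is exactly what happens when an int error value is printed with %hhd. A minimal stand-alone illustration, with a made-up value rather than anything from the driver:

#include <stdio.h>

int main(void)
{
	int err = -22;	/* an errno-style value, wider than a char */

	printf("error: %hhd\n", err);	/* -Wformat: %hhd expects signed char, err is int */
	printf("error: %d\n", err);	/* matches the int argument, no warning */
	return 0;
}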
Reported-by: kernel test robot Fixes: 3c4153394e2c ("bnxt_en: implement firmware live patching") Signed-off-by: Edwin Peer Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index a0a9af402642..6fe9e9b59f83 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -360,7 +360,7 @@ bnxt_dl_livepatch_report_err(struct bnxt *bp, struct netlink_ext_ack *extack, NL_SET_ERR_MSG_MOD(extack, "Live patch already applied"); break; default: - netdev_err(bp->dev, "Unexpected live patch error: %hhd\n", err); + netdev_err(bp->dev, "Unexpected live patch error: %d\n", err); NL_SET_ERR_MSG_MOD(extack, "Failed to activate live patch"); break; } From b0757491a118ae5727cf9f1c3a11544397d46596 Mon Sep 17 00:00:00 2001 From: Sriharsha Basavapatna Date: Mon, 15 Nov 2021 02:38:01 -0500 Subject: [PATCH 077/336] bnxt_en: reject indirect blk offload when hw-tc-offload is off The driver does not check if hw-tc-offload is enabled for the device before offloading a flow in the context of indirect block callback. Fix this by checking NETIF_F_HW_TC in the features flag and rejecting the offload request. This will avoid unnecessary dmesg error logs when hw-tc-offload is disabled, such as these: bnxt_en 0000:19:00.1 eno2np1: dev(ifindex=294) not on same switch bnxt_en 0000:19:00.1 eno2np1: Error: bnxt_tc_add_flow: cookie=0xffff8dace1c88000 error=-22 bnxt_en 0000:19:00.0 eno1np0: dev(ifindex=294) not on same switch bnxt_en 0000:19:00.0 eno1np0: Error: bnxt_tc_add_flow: cookie=0xffff8dace1c88000 error=-22 Reported-by: Marcelo Ricardo Leitner Fixes: 627c89d00fb9 ("bnxt_en: flow_offload: offload tunnel decap rules via indirect callbacks") Signed-off-by: Sriharsha Basavapatna Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index e6a4a768b10b..1471b6130a2b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -1868,7 +1868,7 @@ static int bnxt_tc_setup_indr_block_cb(enum tc_setup_type type, struct flow_cls_offload *flower = type_data; struct bnxt *bp = priv->bp; - if (flower->common.chain_index) + if (!tc_cls_can_offload_and_chain0(bp->dev, type_data)) return -EOPNOTSUPP; switch (type) { From 9119570039481d56350af1c636f040fb300b8cf3 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Mon, 15 Nov 2021 15:04:23 +0800 Subject: [PATCH 078/336] net: stmmac: socfpga: add runtime suspend/resume callback for stratix10 platform According to upstream commit 5ec55823438e("net: stmmac: add clocks management for gmac driver"), it improve clocks management for stmmac driver. So, it is necessary to implement the runtime callback in dwmac-socfpga driver because it doesn't use the common stmmac_pltfr_pm_ops instance. Otherwise, clocks are not disabled when system enters suspend status. Fixes: 5ec55823438e ("net: stmmac: add clocks management for gmac driver") Cc: stable@vger.kernel.org Signed-off-by: Meng Li Signed-off-by: David S. 
Miller --- .../ethernet/stmicro/stmmac/dwmac-socfpga.c | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 85208128f135..b7c2579c963b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -485,8 +485,28 @@ static int socfpga_dwmac_resume(struct device *dev) } #endif /* CONFIG_PM_SLEEP */ -static SIMPLE_DEV_PM_OPS(socfpga_dwmac_pm_ops, stmmac_suspend, - socfpga_dwmac_resume); +static int __maybe_unused socfpga_dwmac_runtime_suspend(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct stmmac_priv *priv = netdev_priv(ndev); + + stmmac_bus_clks_config(priv, false); + + return 0; +} + +static int __maybe_unused socfpga_dwmac_runtime_resume(struct device *dev) +{ + struct net_device *ndev = dev_get_drvdata(dev); + struct stmmac_priv *priv = netdev_priv(ndev); + + return stmmac_bus_clks_config(priv, true); +} + +static const struct dev_pm_ops socfpga_dwmac_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(stmmac_suspend, socfpga_dwmac_resume) + SET_RUNTIME_PM_OPS(socfpga_dwmac_runtime_suspend, socfpga_dwmac_runtime_resume, NULL) +}; static const struct socfpga_dwmac_ops socfpga_gen5_ops = { .set_phy_mode = socfpga_gen5_set_phy_mode, From 6def480181f15f6d9ec812bca8cbc62451ba314c Mon Sep 17 00:00:00 2001 From: liuguoqiang Date: Mon, 15 Nov 2021 16:14:48 +0800 Subject: [PATCH 079/336] net: return correct error code When kmemdup called failed and register_net_sysctl return NULL, should return ENOMEM instead of ENOBUFS Signed-off-by: liuguoqiang Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ec73a0d52d3e..323e622ff9b7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2591,7 +2591,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, free: kfree(t); out: - return -ENOBUFS; + return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, From 271351d255b09e39c7f6437738cba595f9b235be Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Nov 2021 07:45:24 -0500 Subject: [PATCH 080/336] tipc: only accept encrypted MSG_CRYPTO msgs The MSG_CRYPTO msgs are always encrypted and sent to other nodes for keys' deployment. But when receiving in peers, if those nodes do not validate it and make sure it's encrypted, one could craft a malicious MSG_CRYPTO msg to deploy its key with no need to know other nodes' keys. This patch is to do that by checking TIPC_SKB_CB(skb)->decrypted and discard it if this packet never got decrypted. Note that this is also a supplementary fix to CVE-2021-43267 that can be triggered by an unencrypted malicious MSG_CRYPTO msg. Fixes: 1ef6f7c9390f ("tipc: add automatic session key exchange") Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/tipc/link.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 1b7a487c8841..09ae8448f394 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1298,8 +1298,11 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, return false; #ifdef CONFIG_TIPC_CRYPTO case MSG_CRYPTO: - tipc_crypto_msg_rcv(l->net, skb); - return true; + if (TIPC_SKB_CB(skb)->decrypted) { + tipc_crypto_msg_rcv(l->net, skb); + return true; + } + fallthrough; #endif default: pr_warn("Dropping received illegal msg type\n"); From 938cca9e4109b30ee1d476904538225a825e54eb Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 15 Nov 2021 19:16:56 +0900 Subject: [PATCH 081/336] sock: fix /proc/net/sockstat underflow in sk_clone_lock() sk_clone_lock() needs to call sock_inuse_add(1) before entering the sk_free_unlock_clone() error path, for __sk_free() from sk_free() from sk_free_unlock_clone() calls sock_inuse_add(-1). Signed-off-by: Tetsuo Handa Fixes: 648845ab7e200993 ("sock: Move the socket inuse to namespace.") Signed-off-by: David S. Miller --- net/core/sock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 8f2b2f2c0e7b..41e91d0f7061 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2124,8 +2124,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_prot_creator = prot; /* SANITY */ - if (likely(newsk->sk_net_refcnt)) + if (likely(newsk->sk_net_refcnt)) { get_net(sock_net(newsk)); + sock_inuse_add(sock_net(newsk), 1); + } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); @@ -2197,8 +2199,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); - if (likely(newsk->sk_net_refcnt)) - sock_inuse_add(sock_net(newsk), 1); /* Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.rst for details) From cf4f5530bb55ef7d5a91036b26676643b80b1616 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Mon, 15 Nov 2021 17:45:07 +0800 Subject: [PATCH 082/336] net/smc: Make sure the link_id is unique The link_id is supposed to be unique, but smcr_next_link_id() doesn't skip the used link_id as expected. So the patch fixes this. Fixes: 026c381fb477 ("net/smc: introduce link_idx for link group array") Signed-off-by: Wen Gu Reviewed-by: Tony Lu Acked-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 49b8ba3bb683..25ebd30feecd 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -708,13 +708,14 @@ static u8 smcr_next_link_id(struct smc_link_group *lgr) int i; while (1) { +again: link_id = ++lgr->next_link_id; if (!link_id) /* skip zero as link_id */ link_id = ++lgr->next_link_id; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (smc_link_usable(&lgr->lnk[i]) && lgr->lnk[i].link_id == link_id) - continue; + goto again; } break; } From c0019b7db1d7ac62c711cda6b357a659d46428fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 14 Nov 2021 15:16:04 -0500 Subject: [PATCH 083/336] NFSD: Fix exposure in nfsd4_decode_bitmap() rtm@csail.mit.edu reports: > nfsd4_decode_bitmap4() will write beyond bmval[bmlen-1] if the RPC > directs it to do so. 
This can cause nfsd4_decode_state_protect4_a() > to write client-supplied data beyond the end of > nfsd4_exchange_id.spo_must_allow[] when called by > nfsd4_decode_exchange_id(). Rewrite the loops so nfsd4_decode_bitmap() cannot iterate beyond @bmlen. Reported by: rtm@csail.mit.edu Fixes: d1c263a031e8 ("NFSD: Replace READ* macros in nfsd4_decode_fattr()") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9b609aac47e1..5bdfaa43c99d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -288,11 +288,8 @@ nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen) p = xdr_inline_decode(argp->xdr, count << 2); if (!p) return nfserr_bad_xdr; - i = 0; - while (i < count) - bmval[i++] = be32_to_cpup(p++); - while (i < bmlen) - bmval[i++] = 0; + for (i = 0; i < bmlen; i++) + bmval[i] = (i < count) ? be32_to_cpup(p++) : 0; return nfs_ok; } From 94c4b4fd25e6c3763941bdec3ad54f2204afa992 Mon Sep 17 00:00:00 2001 From: Alistair Delva Date: Mon, 15 Nov 2021 18:16:55 +0000 Subject: [PATCH 084/336] block: Check ADMIN before NICE for IOPRIO_CLASS_RT Booting to Android userspace on 5.14 or newer triggers the following SELinux denial: avc: denied { sys_nice } for comm="init" capability=23 scontext=u:r:init:s0 tcontext=u:r:init:s0 tclass=capability permissive=0 Init is PID 0 running as root, so it already has CAP_SYS_ADMIN. For better compatibility with older SEPolicy, check ADMIN before NICE. Fixes: 9d3a39a5f1e4 ("block: grant IOPRIO_CLASS_RT to CAP_SYS_NICE") Signed-off-by: Alistair Delva Cc: Khazhismel Kumykov Cc: Bart Van Assche Cc: Serge Hallyn Cc: Jens Axboe Cc: Greg Kroah-Hartman Cc: Paul Moore Cc: selinux@vger.kernel.org Cc: linux-security-module@vger.kernel.org Cc: kernel-team@android.com Cc: stable@vger.kernel.org # v5.14+ Reviewed-by: Bart Van Assche Acked-by: Serge Hallyn Link: https://lore.kernel.org/r/20211115181655.3608659-1-adelva@google.com Signed-off-by: Jens Axboe --- block/ioprio.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/ioprio.c b/block/ioprio.c index 0e4ff245f2bf..313c14a70bbd 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -69,7 +69,14 @@ int ioprio_check_cap(int ioprio) switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN)) + /* + * Originally this only checked for CAP_SYS_ADMIN, + * which was implicitly allowed for pid 0 by security + * modules such as SELinux. Make sure we check + * CAP_SYS_ADMIN first to avoid a denial/avc for + * possibly missing CAP_SYS_NICE permission. + */ + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; fallthrough; /* rt has prio field too */ From 4e5e6b5d9d1334d3490326b6922a2daaf56a867f Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Tue, 9 Feb 2021 11:59:38 +0000 Subject: [PATCH 085/336] iavf: Fix return of set the new channel count Fixed return correct code from set the new channel count. Implemented by check if reset is done in appropriate time. This solution give a extra time to pf for reset vf in case when user want set new channel count for all vfs. Without this patch it is possible to return misleading output code to user and vf reset not to be correctly performed by pf. 
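The wait loop added above is an instance of a common bounded-polling pattern: start the asynchronous operation, then sleep and re-test a fixed number of times, and fall back to an error if the condition never clears. A generic user-space sketch of that structure, using placeholder names and constants rather than the driver's own:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define RESET_WAIT_MS		10
#define RESET_WAIT_COUNT	500

static int pending_polls = 3;		/* stand-in for "hardware still resetting" */

static bool reset_pending(void)
{
	return pending_polls-- > 0;
}

static int wait_for_reset(void)
{
	int i;

	for (i = 0; i < RESET_WAIT_COUNT; i++) {
		if (!reset_pending())
			return 0;	/* reset finished within the budget */
		usleep(RESET_WAIT_MS * 1000);
	}
	return -1;			/* timed out: report failure to the caller */
}

int main(void)
{
	printf("wait_for_reset() = %d\n", wait_for_reset());
	return 0;
}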
Fixes: 5520deb15326 ("iavf: Enable support for up to 16 queues") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mateusz Palczewski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 5a359a0a20ec..136c801f5584 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1776,6 +1776,7 @@ static int iavf_set_channels(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); u32 num_req = ch->combined_count; + int i; if ((adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_ADQ) && adapter->num_tc) { @@ -1798,6 +1799,20 @@ static int iavf_set_channels(struct net_device *netdev, adapter->num_req_queues = num_req; adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED; iavf_schedule_reset(adapter); + + /* wait for the reset is done */ + for (i = 0; i < IAVF_RESET_WAIT_COMPLETE_COUNT; i++) { + msleep(IAVF_RESET_WAIT_MS); + if (adapter->flags & IAVF_FLAG_RESET_PENDING) + continue; + break; + } + if (i == IAVF_RESET_WAIT_COMPLETE_COUNT) { + adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + adapter->num_active_queues = num_req; + return -EOPNOTSUPP; + } + return 0; } From 8a4a126f4be88eb8b5f00a165ab58c35edf4ef76 Mon Sep 17 00:00:00 2001 From: Nicholas Nunley Date: Fri, 4 Jun 2021 09:48:53 -0700 Subject: [PATCH 086/336] iavf: check for null in iavf_fix_features If the driver has lost contact with the PF then it enters a disabled state and frees adapter->vf_res. However, ndo_fix_features can still be called on the interface, so we need to check for this condition first. Since we have no information on the features at this time simply leave them unmodified and return. Fixes: c4445aedfe09 ("i40evf: Fix VLAN features") Signed-off-by: Nicholas Nunley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 847d67e32a54..561171507cda 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3503,7 +3503,8 @@ static netdev_features_t iavf_fix_features(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); - if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)) + if (adapter->vf_res && + !(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)) features &= ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER); From 89f22f129696ab53cfbc608e0a2184d0fea46ac1 Mon Sep 17 00:00:00 2001 From: Nicholas Nunley Date: Fri, 4 Jun 2021 09:48:54 -0700 Subject: [PATCH 087/336] iavf: free q_vectors before queues in iavf_disable_vf iavf_free_queues() clears adapter->num_active_queues, which iavf_free_q_vectors() relies on, so swap the order of these two function calls in iavf_disable_vf(). This resolves a panic encountered when the interface is disabled and then later brought up again after PF communication is restored. 
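The reordering above is a teardown-dependency fix: one cleanup routine walks a count that the other cleanup routine zeroes, so the routine that reads the count has to run first. A compact stand-alone sketch of the same shape (the structure and helpers are hypothetical, not the iavf code):

#include <stdlib.h>

struct adapter {
	int num_active_queues;
	int *vectors;			/* one entry per active queue */
	int *queues;
};

static void free_vectors(struct adapter *a)
{
	for (int i = 0; i < a->num_active_queues; i++)
		a->vectors[i] = 0;	/* per-vector teardown stands in here */
	free(a->vectors);
	a->vectors = NULL;
}

static void free_queues(struct adapter *a)
{
	free(a->queues);
	a->queues = NULL;
	a->num_active_queues = 0;	/* clears the count free_vectors() depends on */
}

static void disable(struct adapter *a)
{
	free_vectors(a);		/* consumer of the shared count first ... */
	free_queues(a);			/* ... then the routine that resets it */
}

int main(void)
{
	struct adapter a = {
		.num_active_queues = 4,
		.vectors = calloc(4, sizeof(int)),
		.queues  = calloc(4, sizeof(int)),
	};

	disable(&a);
	return 0;
}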
Fixes: 65c7006f234c ("i40evf: assign num_active_queues inside i40evf_alloc_queues") Signed-off-by: Nicholas Nunley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 561171507cda..c23fff5a4bd9 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2123,8 +2123,8 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_free_misc_irq(adapter); iavf_reset_interrupt_capability(adapter); - iavf_free_queues(adapter); iavf_free_q_vectors(adapter); + iavf_free_queues(adapter); memset(adapter->vf_res, 0, IAVF_VIRTCHNL_VF_RESOURCE_SIZE); iavf_shutdown_adminq(&adapter->hw); adapter->netdev->flags &= ~IFF_UP; From 2135a8d5c8186bc92901dc00f179ffd50e54c2ac Mon Sep 17 00:00:00 2001 From: Nicholas Nunley Date: Fri, 4 Jun 2021 09:48:55 -0700 Subject: [PATCH 088/336] iavf: don't clear a lock we don't hold In iavf_configure_clsflower() the function will bail out if it is unable to obtain the crit_section lock in a reasonable time. However, it will clear the lock when exiting, so fix this. Fixes: 640a8af5841f ("i40evf: Reorder configure_clsflower to avoid deadlock on error") Signed-off-by: Nicholas Nunley Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index c23fff5a4bd9..28661e4425f1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3095,8 +3095,10 @@ static int iavf_configure_clsflower(struct iavf_adapter *adapter, return -ENOMEM; while (!mutex_trylock(&adapter->crit_lock)) { - if (--count == 0) - goto err; + if (--count == 0) { + kfree(filter); + return err; + } udelay(1); } From 8905072a192fffe9389255489db250c73ecab008 Mon Sep 17 00:00:00 2001 From: Piotr Marczak Date: Fri, 4 Jun 2021 09:48:56 -0700 Subject: [PATCH 089/336] iavf: Fix failure to exit out from last all-multicast mode The driver could only quit allmulti when allmulti and promisc modes are turn on at the same time. If promisc had been off there was no way to turn off allmulti mode. The patch corrects this behavior. 
Switching allmulti does not depends on promisc state mode anymore Fixes: f42a5c74da99 ("i40e: Add allmulti support for the VF") Signed-off-by: Piotr Marczak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 28661e4425f1..76c4ca0f055e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1639,8 +1639,7 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter) iavf_set_promiscuous(adapter, FLAG_VF_MULTICAST_PROMISC); return 0; } - - if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) && + if ((adapter->aq_required & IAVF_FLAG_AQ_RELEASE_PROMISC) || (adapter->aq_required & IAVF_FLAG_AQ_RELEASE_ALLMULTI)) { iavf_set_promiscuous(adapter, 0); return 0; From 4f0400803818f2642f066d3eacaf013f23554cc7 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 4 Jun 2021 09:48:57 -0700 Subject: [PATCH 090/336] iavf: prevent accidental free of filter structure In iavf_config_clsflower, the filter structure could be accidentally released at the end, if iavf_parse_cls_flower or iavf_handle_tclass ever return a non-zero but positive value. In this case, the function continues through to the end, and will call kfree() on the filter structure even though it has been added to the linked list. This can actually happen because iavf_parse_cls_flower will return a positive IAVF_ERR_CONFIG value instead of the traditional negative error codes. Fix this by ensuring that the kfree() check and error checks are similar. Use the more idiomatic "if (err)" to catch all non-zero error codes. Fixes: 0075fa0fadd0 ("i40evf: Add support to apply cloud filters") Signed-off-by: Jacob Keller Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 76c4ca0f055e..9c68c8628512 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3108,11 +3108,11 @@ static int iavf_configure_clsflower(struct iavf_adapter *adapter, /* start out with flow type and eth type IPv4 to begin with */ filter->f.flow_type = VIRTCHNL_TCP_V4_FLOW; err = iavf_parse_cls_flower(adapter, cls_flower, filter); - if (err < 0) + if (err) goto err; err = iavf_handle_tclass(adapter, tc, filter); - if (err < 0) + if (err) goto err; /* add filter to the list */ From 131b0edc4028bb88bb472456b1ddba526cfb7036 Mon Sep 17 00:00:00 2001 From: Mitch Williams Date: Fri, 4 Jun 2021 09:48:58 -0700 Subject: [PATCH 091/336] iavf: validate pointers In some cases, the ethtool get_rxfh handler may be called with a null key or indir parameter. So check these pointers, or you will have a very bad day. 
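The rule applied above, treating every output pointer of the callback as optional and copying only what the caller asked for, can be sketched independently of the driver (all of the names below are hypothetical):

#include <stddef.h>
#include <string.h>

/* Copy RSS state only into the buffers the caller actually supplied. */
void report_rss(const unsigned char *rss_key, size_t key_len,
		const unsigned int *rss_lut, size_t lut_len,
		unsigned char *key, unsigned int *indir)
{
	size_t i;

	if (key)
		memcpy(key, rss_key, key_len);

	if (indir)
		for (i = 0; i < lut_len; i++)
			indir[i] = rss_lut[i];
}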
Fixes: 43a3d9ba34c9 ("i40evf: Allow PF driver to configure RSS") Signed-off-by: Mitch Williams Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 136c801f5584..25ee0606e625 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1859,14 +1859,13 @@ static int iavf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, if (hfunc) *hfunc = ETH_RSS_HASH_TOP; - if (!indir) - return 0; + if (key) + memcpy(key, adapter->rss_key, adapter->rss_key_size); - memcpy(key, adapter->rss_key, adapter->rss_key_size); - - /* Each 32 bits pointed by 'indir' is stored with a lut entry */ - for (i = 0; i < adapter->rss_lut_size; i++) - indir[i] = (u32)adapter->rss_lut[i]; + if (indir) + /* Each 32 bits pointed by 'indir' is stored with a lut entry */ + for (i = 0; i < adapter->rss_lut_size; i++) + indir[i] = (u32)adapter->rss_lut[i]; return 0; } From 321421b57a12e933f92b228e0e6d0b2c6541f41d Mon Sep 17 00:00:00 2001 From: Surabhi Boob Date: Fri, 4 Jun 2021 09:48:59 -0700 Subject: [PATCH 092/336] iavf: Fix for the false positive ASQ/ARQ errors while issuing VF reset While issuing VF Reset from the guest OS, the VF driver prints logs about critical / Overflow error detection. This is not an actual error since the VF_MBX_ARQLEN register is set to all FF's for a short period of time and the VF would catch the bits set if it was reading the register during that spike of time. This patch introduces an additional check to ignore this condition since the VF is in reset. Fixes: 19b73d8efaa4 ("i40evf: Add additional check for reset") Signed-off-by: Surabhi Boob Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 9c68c8628512..9ca9208aa896 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2409,7 +2409,7 @@ static void iavf_adminq_task(struct work_struct *work) /* check for error indications */ val = rd32(hw, hw->aq.arq.len); - if (val == 0xdeadbeef) /* indicates device in reset */ + if (val == 0xdeadbeef || val == 0xffffffff) /* device in reset */ goto freedom; oldval = val; if (val & IAVF_VF_ARQLEN1_ARQVFE_MASK) { From 9a6e9e483a9684a34573fd9f9e30ecfb047cb8cb Mon Sep 17 00:00:00 2001 From: Grzegorz Szczurek Date: Fri, 4 Jun 2021 09:49:00 -0700 Subject: [PATCH 093/336] iavf: Fix for setting queues to 0 Now setting combine to 0 will be rejected with the appropriate error code. This has been implemented by adding a condition that checks the value of combine equal to zero. Without this patch, when the user requested it, no error was returned and combine was set to the default value for VF. 
Fixes: 5520deb15326 ("iavf: Enable support for up to 16 queues") Signed-off-by: Grzegorz Szczurek Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 25ee0606e625..144a77679359 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1787,7 +1787,7 @@ static int iavf_set_channels(struct net_device *netdev, /* All of these should have already been checked by ethtool before this * even gets to us, but just to be sure. */ - if (num_req > adapter->vsi_res->num_queue_pairs) + if (num_req == 0 || num_req > adapter->vsi_res->num_queue_pairs) return -EINVAL; if (num_req == adapter->num_active_queues) From 4293014230b887d94b68aa460ff00153454a3709 Mon Sep 17 00:00:00 2001 From: Akeem G Abodunrin Date: Fri, 4 Jun 2021 09:53:27 -0700 Subject: [PATCH 094/336] iavf: Restore VLAN filters after link down Restore VLAN filters after the link is brought down, and up - since all filters are deleted from HW during the netdev link down routine. Fixes: ed1f5b58ea01 ("i40evf: remove VLAN filters on close") Signed-off-by: Akeem G Abodunrin Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 35 ++++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index e6e7c1da47fb..75635bd57cf6 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -39,6 +39,7 @@ #include "iavf_txrx.h" #include "iavf_fdir.h" #include "iavf_adv_rss.h" +#include #define DEFAULT_DEBUG_LEVEL_SHIFT 3 #define PFX "iavf: " diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 9ca9208aa896..336e6bf95e48 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -696,6 +696,23 @@ static void iavf_del_vlan(struct iavf_adapter *adapter, u16 vlan) spin_unlock_bh(&adapter->mac_vlan_list_lock); } +/** + * iavf_restore_filters + * @adapter: board private structure + * + * Restore existing non MAC filters when VF netdev comes back up + **/ +static void iavf_restore_filters(struct iavf_adapter *adapter) +{ + /* re-add all VLAN filters */ + if (VLAN_ALLOWED(adapter)) { + u16 vid; + + for_each_set_bit(vid, adapter->vsi.active_vlans, VLAN_N_VID) + iavf_add_vlan(adapter, vid); + } +} + /** * iavf_vlan_rx_add_vid - Add a VLAN filter to a device * @netdev: network device struct @@ -709,8 +726,11 @@ static int iavf_vlan_rx_add_vid(struct net_device *netdev, if (!VLAN_ALLOWED(adapter)) return -EIO; + if (iavf_add_vlan(adapter, vid) == NULL) return -ENOMEM; + + set_bit(vid, adapter->vsi.active_vlans); return 0; } @@ -725,11 +745,13 @@ static int iavf_vlan_rx_kill_vid(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); - if (VLAN_ALLOWED(adapter)) { - iavf_del_vlan(adapter, vid); - return 0; - } - return -EIO; + if (!VLAN_ALLOWED(adapter)) + return -EIO; + + iavf_del_vlan(adapter, vid); + clear_bit(vid, adapter->vsi.active_vlans); + + return 0; } /** @@ -3309,6 +3331,9 @@ static int iavf_open(struct net_device *netdev) spin_unlock_bh(&adapter->mac_vlan_list_lock); + /* Restore VLAN filters that were removed with 
IFF_DOWN */ + iavf_restore_filters(adapter); + iavf_configure(adapter); iavf_up_complete(adapter); From 95febeb61bf87ca803a1270498cd4cd61554a68f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Nov 2021 14:23:08 -0700 Subject: [PATCH 095/336] block: fix missing queue put in error path If we fail the submission queue checks, we don't put the queue afterwards. This can cause various issues like stalls on scheduler switch or failure to remove the device, or like in the original bug report, timeout waiting for the device on reboot/restart. While in there, fix a few whitespace discrepancies in the surrounding code. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215039 Fixes: b637108a4022 ("blk-mq: fix filesystem I/O request allocation") Reported-and-tested-by: Stephen Smith Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3ab34c4f20da..5e1c9fd99353 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2543,8 +2543,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -static inline bool blk_mq_can_use_cached_rq(struct request *rq, - struct bio *bio) +static inline bool blk_mq_can_use_cached_rq(struct request *rq, struct bio *bio) { if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type) return false; @@ -2565,7 +2564,6 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, bool checked = false; if (plug) { - rq = rq_list_peek(&plug->cached_rq); if (rq && rq->q == q) { if (unlikely(!submit_bio_checks(bio))) @@ -2587,12 +2585,14 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, fallback: if (unlikely(bio_queue_enter(bio))) return NULL; - if (!checked && !submit_bio_checks(bio)) - return NULL; + if (unlikely(!checked && !submit_bio_checks(bio))) + goto out_put; rq = blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); - if (!rq) - blk_queue_exit(q); - return rq; + if (rq) + return rq; +out_put: + blk_queue_exit(q); + return NULL; } /** From e9380df851878cee71df5a1c7611584421527f7e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 31 Oct 2021 20:48:52 -0500 Subject: [PATCH 096/336] ACPI: Add stubs for wakeup handler functions The commit ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") added new functions for drivers to use during the s2idle wakeup path, but didn't add stubs for when CONFIG_ACPI wasn't set. Add those stubs in for other drivers to be able to use. Fixes: ddfd9dcf270c ("ACPI: PM: Add acpi_[un]register_wakeup_handler()") Acked-by: Rafael J. 
Wysocki Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20211101014853.6177-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- include/linux/acpi.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 143ce7e0bee1..668d007f0917 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -974,6 +974,15 @@ static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) return -ENODEV; } +static inline int acpi_register_wakeup_handler(int wake_irq, + bool (*wakeup)(void *context), void *context) +{ + return -ENXIO; +} + +static inline void acpi_unregister_wakeup_handler( + bool (*wakeup)(void *context), void *context) { } + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC From 2d54067fcd23aae61e23508425ae5b29e973573d Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 31 Oct 2021 20:48:53 -0500 Subject: [PATCH 097/336] pinctrl: amd: Fix wakeups when IRQ is shared with SCI On some Lenovo AMD Gen2 platforms the IRQ for the SCI and pinctrl drivers are shared. Due to how the s2idle loop handling works, this case needs an extra explicit check whether the interrupt was caused by SCI or by the GPIO controller. To fix this rework the existing IRQ handler function to function as a checker and an IRQ handler depending on the calling arguments. BugLink: https://gitlab.freedesktop.org/drm/amd/-/issues/1738 Reported-by: Joerie de Gram Signed-off-by: Mario Limonciello Acked-by: Basavaraj Natikar Link: https://lore.kernel.org/r/20211101014853.6177-2-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index bae9d429b813..ecab9064a845 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -598,14 +598,14 @@ static struct irq_chip amd_gpio_irqchip = { #define PIN_IRQ_PENDING (BIT(INTERRUPT_STS_OFF) | BIT(WAKE_STS_OFF)) -static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) +static bool do_amd_gpio_irq_handler(int irq, void *dev_id) { struct amd_gpio *gpio_dev = dev_id; struct gpio_chip *gc = &gpio_dev->gc; - irqreturn_t ret = IRQ_NONE; unsigned int i, irqnr; unsigned long flags; u32 __iomem *regs; + bool ret = false; u32 regval; u64 status, mask; @@ -627,6 +627,14 @@ static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) /* Each status bit covers four pins */ for (i = 0; i < 4; i++) { regval = readl(regs + i); + /* caused wake on resume context for shared IRQ */ + if (irq < 0 && (regval & BIT(WAKE_STS_OFF))) { + dev_dbg(&gpio_dev->pdev->dev, + "Waking due to GPIO %d: 0x%x", + irqnr + i, regval); + return true; + } + if (!(regval & PIN_IRQ_PENDING) || !(regval & BIT(INTERRUPT_MASK_OFF))) continue; @@ -650,9 +658,12 @@ static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) } writel(regval, regs + i); raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); - ret = IRQ_HANDLED; + ret = true; } } + /* did not cause wake on resume context for shared IRQ */ + if (irq < 0) + return false; /* Signal EOI to the GPIO unit */ raw_spin_lock_irqsave(&gpio_dev->lock, flags); @@ -664,6 +675,16 @@ static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) return ret; } +static irqreturn_t amd_gpio_irq_handler(int irq, void *dev_id) +{ + return IRQ_RETVAL(do_amd_gpio_irq_handler(irq, dev_id)); +} + +static bool __maybe_unused amd_gpio_check_wake(void *dev_id) +{ + return 
do_amd_gpio_irq_handler(-1, dev_id); +} + static int amd_get_groups_count(struct pinctrl_dev *pctldev) { struct amd_gpio *gpio_dev = pinctrl_dev_get_drvdata(pctldev); @@ -1033,6 +1054,7 @@ static int amd_gpio_probe(struct platform_device *pdev) goto out2; platform_set_drvdata(pdev, gpio_dev); + acpi_register_wakeup_handler(gpio_dev->irq, amd_gpio_check_wake, gpio_dev); dev_dbg(&pdev->dev, "amd gpio driver loaded\n"); return ret; @@ -1050,6 +1072,7 @@ static int amd_gpio_remove(struct platform_device *pdev) gpio_dev = platform_get_drvdata(pdev); gpiochip_remove(&gpio_dev->gc); + acpi_unregister_wakeup_handler(amd_gpio_check_wake, gpio_dev); return 0; } From 55924812d208a8a27a6690db2bbba357bd2773f0 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Thu, 28 Oct 2021 17:32:35 +0530 Subject: [PATCH 098/336] pinctrl: tegra: Return const pointer from tegra_pinctrl_get_group() Instead of returning const pointer from tegra_pinctrl_get_group() the return value is being casted. This change helps return const pointer. Signed-off-by: Prathamesh Shete Signed-off-by: Linus Walleij --- drivers/pinctrl/tegra/pinctrl-tegra.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/tegra/pinctrl-tegra.c b/drivers/pinctrl/tegra/pinctrl-tegra.c index 8d734bfc33d2..50bd26a30ac0 100644 --- a/drivers/pinctrl/tegra/pinctrl-tegra.c +++ b/drivers/pinctrl/tegra/pinctrl-tegra.c @@ -275,7 +275,7 @@ static int tegra_pinctrl_set_mux(struct pinctrl_dev *pctldev, return 0; } -static struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev *pctldev, +static const struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev *pctldev, unsigned int offset) { struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); @@ -289,7 +289,7 @@ static struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev *pctlde continue; for (j = 0; j < num_pins; j++) { if (offset == pins[j]) - return (struct tegra_pingroup *)&pmx->soc->groups[group]; + return &pmx->soc->groups[group]; } } From 60430d4c4eddcdf8eac2bdbec9704f84a436eedf Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Thu, 28 Oct 2021 20:46:10 -0400 Subject: [PATCH 099/336] pinctrl: qcom: fix unmet dependencies on GPIOLIB for GPIOLIB_IRQCHIP When PINCTRL_QCOM_SPMI_PMIC or PINCTRL_QCOM_SSBI_PMIC is selected, and GPIOLIB is not selected, Kbuild gives the following warnings: WARNING: unmet direct dependencies detected for GPIOLIB_IRQCHIP Depends on [n]: GPIOLIB [=n] Selected by [y]: - PINCTRL_QCOM_SPMI_PMIC [=y] && PINCTRL [=y] && (ARCH_QCOM [=n] || COMPILE_TEST [=y]) && OF [=y] && SPMI [=y] WARNING: unmet direct dependencies detected for GPIOLIB_IRQCHIP Depends on [n]: GPIOLIB [=n] Selected by [y]: - PINCTRL_QCOM_SSBI_PMIC [=y] && PINCTRL [=y] && (ARCH_QCOM [=n] || COMPILE_TEST [=y]) && OF [=y] This is because these config options enable GPIOLIB_IRQCHIP without selecting or depending on GPIOLIB, despite GPIOLIB_IRQCHIP depending on GPIOLIB. These unmet dependency bugs were detected by Kismet, a static analysis tool for Kconfig. Please advise if this is not the appropriate solution. 
Signed-off-by: Julian Braha Link: https://lore.kernel.org/r/20211029004610.35131-1-julianbraha@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index b9191f1abb1c..3e0c00766f59 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -197,6 +197,7 @@ config PINCTRL_QCOM_SPMI_PMIC select PINMUX select PINCONF select GENERIC_PINCONF + select GPIOLIB select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY help @@ -211,6 +212,7 @@ config PINCTRL_QCOM_SSBI_PMIC select PINMUX select PINCONF select GENERIC_PINCONF + select GPIOLIB select GPIOLIB_IRQCHIP select IRQ_DOMAIN_HIERARCHY help From a5b9703fe11cd1d6d7a60102aa2abe686dc1867f Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Sun, 31 Oct 2021 07:40:46 +0100 Subject: [PATCH 100/336] pinctrl: ralink: include 'ralink_regs.h' in 'pinctrl-mt7620.c' mt7620.h, included by pinctrl-mt7620.c, mentions MT762X_SOC_MT7628AN declared in ralink_regs.h. Fixes: 745ec436de72 ("pinctrl: ralink: move MT7620 SoC pinmux config into a new 'pinctrl-mt7620.c' file") Cc: stable@vger.kernel.org Signed-off-by: Luiz Angelo Daros de Luca Signed-off-by: Sergio Paracuellos Link: https://lore.kernel.org/r/20211031064046.13533-1-sergio.paracuellos@gmail.com Signed-off-by: Linus Walleij --- drivers/pinctrl/ralink/pinctrl-mt7620.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/ralink/pinctrl-mt7620.c b/drivers/pinctrl/ralink/pinctrl-mt7620.c index 425d55a2ee19..6853b5b8b0fe 100644 --- a/drivers/pinctrl/ralink/pinctrl-mt7620.c +++ b/drivers/pinctrl/ralink/pinctrl-mt7620.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include From 9b3b94e9eb144ef80363b94c178a9a5b9172d295 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Mon, 1 Nov 2021 16:06:40 +0100 Subject: [PATCH 101/336] pinctrl: apple: Always return valid type in apple_gpio_irq_type apple_gpio_irq_type can possibly return -EINVAL which triggers the following compile error with gcc 9 because the type no longer fits into the mask. drivers/pinctrl/pinctrl-apple-gpio.c: In function 'apple_gpio_irq_set_type': ././include/linux/compiler_types.h:335:38: error: call to '__compiletime_assert_289' declared with attribute error: FIELD_PREP: value too large for the field 335 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ [...] drivers/pinctrl/pinctrl-apple-gpio.c:294:7: note: in expansion of macro 'FIELD_PREP' 294 | FIELD_PREP(REG_GPIOx_MODE, irqtype)); | ^~~~~~~~~~ Fix this by making the return value always valid and instead checking for REG_GPIOx_IN_IRQ_OFF in apple_gpio_irq_set_type and return -EINVAL from there. 
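The pitfall behind this fix is a general one: once a helper returns an unsigned type, it can no longer carry a negative error code such as -EINVAL, so a dedicated sentinel value has to stand in and the int-returning caller reports the error. A minimal sketch of that pattern, with invented names rather than the pinctrl driver's:

#include <errno.h>

#define MODE_OFF	0u	/* sentinel: "no valid mode" */
#define MODE_RISING	1u

static unsigned int type_to_mode(unsigned int type)
{
	switch (type) {
	case 1:
		return MODE_RISING;
	default:
		return MODE_OFF;	/* cannot return -EINVAL from an unsigned helper */
	}
}

static int set_type(unsigned int type)
{
	unsigned int mode = type_to_mode(type);

	if (mode == MODE_OFF)
		return -EINVAL;		/* the error is reported at the int-returning level */
	/* ... program the hardware with mode ... */
	return 0;
}

int main(void)
{
	return set_type(0) == -EINVAL ? 0 : 1;	/* an unsupported type is rejected */
}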
Fixes: a0f160ffcb83 ("pinctrl: add pinctrl/GPIO driver for Apple SoCs") Signed-off-by: Sven Peter Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20211101150640.46553-1-sven@svenpeter.dev Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-apple-gpio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-apple-gpio.c b/drivers/pinctrl/pinctrl-apple-gpio.c index 0cc346bfc4c3..a7861079a650 100644 --- a/drivers/pinctrl/pinctrl-apple-gpio.c +++ b/drivers/pinctrl/pinctrl-apple-gpio.c @@ -258,7 +258,7 @@ static void apple_gpio_irq_ack(struct irq_data *data) pctl->base + REG_IRQ(irqgrp, data->hwirq)); } -static int apple_gpio_irq_type(unsigned int type) +static unsigned int apple_gpio_irq_type(unsigned int type) { switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_RISING: @@ -272,7 +272,7 @@ static int apple_gpio_irq_type(unsigned int type) case IRQ_TYPE_LEVEL_LOW: return REG_GPIOx_IN_IRQ_LO; default: - return -EINVAL; + return REG_GPIOx_IN_IRQ_OFF; } } @@ -288,7 +288,7 @@ static void apple_gpio_irq_unmask(struct irq_data *data) { struct apple_gpio_pinctrl *pctl = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - int irqtype = apple_gpio_irq_type(irqd_get_trigger_type(data)); + unsigned int irqtype = apple_gpio_irq_type(irqd_get_trigger_type(data)); apple_gpio_set_reg(pctl, data->hwirq, REG_GPIOx_MODE, FIELD_PREP(REG_GPIOx_MODE, irqtype)); @@ -313,10 +313,10 @@ static int apple_gpio_irq_set_type(struct irq_data *data, { struct apple_gpio_pinctrl *pctl = gpiochip_get_data(irq_data_get_irq_chip_data(data)); - int irqtype = apple_gpio_irq_type(type); + unsigned int irqtype = apple_gpio_irq_type(type); - if (irqtype < 0) - return irqtype; + if (irqtype == REG_GPIOx_IN_IRQ_OFF) + return -EINVAL; apple_gpio_set_reg(pctl, data->hwirq, REG_GPIOx_MODE, FIELD_PREP(REG_GPIOx_MODE, irqtype)); From 3a3a100473d2f6ebf9bdfe6efedd7e18de724388 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 1 Nov 2021 22:41:15 -0500 Subject: [PATCH 102/336] pinctrl: qcom: sdm845: Enable dual edge errata It has been observed that dual edge triggered wakeirq GPIOs on SDM845 doesn't trigger interrupts on the falling edge. Enabling wakeirq_dual_edge_errata for SDM845 indicates that the PDC in SDM845 suffers from the same problem described, and worked around, by Doug in 'c3c0c2e18d94 ("pinctrl: qcom: Handle broken/missing PDC dual edge IRQs on sc7180")', so enable the workaround for SDM845 as well. The specific problem seen without this is that gpio-keys does not detect the falling edge of the LID gpio on the Lenovo Yoga C630 and as such consistently reports the LID as closed. 
Fixes: e35a6ae0eb3a ("pinctrl/msm: Setup GPIO chip in hierarchy") Signed-off-by: Bjorn Andersson Tested-By: Steev Klimaszewski Reviewed-by: Douglas Anderson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20211102034115.1946036-1-bjorn.andersson@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sdm845.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pinctrl/qcom/pinctrl-sdm845.c b/drivers/pinctrl/qcom/pinctrl-sdm845.c index c51793f6546f..fdfd7b8f3a76 100644 --- a/drivers/pinctrl/qcom/pinctrl-sdm845.c +++ b/drivers/pinctrl/qcom/pinctrl-sdm845.c @@ -1310,6 +1310,7 @@ static const struct msm_pinctrl_soc_data sdm845_pinctrl = { .ngpios = 151, .wakeirq_map = sdm845_pdc_map, .nwakeirq_map = ARRAY_SIZE(sdm845_pdc_map), + .wakeirq_dual_edge_errata = true, }; static const struct msm_pinctrl_soc_data sdm845_acpi_pinctrl = { From a3143f7822a9eeb38f0e046080ae8f79f6c7122d Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 2 Nov 2021 16:01:58 -0600 Subject: [PATCH 103/336] Remove unused header Commit 6a80b30086b8 ("fmc: Delete the FMC subsystem") removed the last user of , but left the header file behind. Nothing uses this file, delete it now. Cc: Linus Walleij Cc: Alessandro Rubini Signed-off-by: Jonathan Corbet Acked-by: Alessandro Rubini Link: https://lore.kernel.org/r/20211102220203.940290-5-corbet@lwn.net Signed-off-by: Linus Walleij --- include/linux/sdb.h | 160 -------------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 include/linux/sdb.h diff --git a/include/linux/sdb.h b/include/linux/sdb.h deleted file mode 100644 index a2404a2bbd10..000000000000 --- a/include/linux/sdb.h +++ /dev/null @@ -1,160 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * This is the official version 1.1 of sdb.h - */ -#ifndef __SDB_H__ -#define __SDB_H__ -#ifdef __KERNEL__ -#include -#else -#include -#endif - -/* - * All structures are 64 bytes long and are expected - * to live in an array, one for each interconnect. - * Most fields of the structures are shared among the - * various types, and most-specific fields are at the - * beginning (for alignment reasons, and to keep the - * magic number at the head of the interconnect record - */ - -/* Product, 40 bytes at offset 24, 8-byte aligned - * - * device_id is vendor-assigned; version is device-specific, - * date is hex (e.g 0x20120501), name is UTF-8, blank-filled - * and not terminated with a 0 byte. - */ -struct sdb_product { - uint64_t vendor_id; /* 0x18..0x1f */ - uint32_t device_id; /* 0x20..0x23 */ - uint32_t version; /* 0x24..0x27 */ - uint32_t date; /* 0x28..0x2b */ - uint8_t name[19]; /* 0x2c..0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* - * Component, 56 bytes at offset 8, 8-byte aligned - * - * The address range is first to last, inclusive - * (for example 0x100000 - 0x10ffff) - */ -struct sdb_component { - uint64_t addr_first; /* 0x08..0x0f */ - uint64_t addr_last; /* 0x10..0x17 */ - struct sdb_product product; /* 0x18..0x3f */ -}; - -/* Type of the SDB record */ -enum sdb_record_type { - sdb_type_interconnect = 0x00, - sdb_type_device = 0x01, - sdb_type_bridge = 0x02, - sdb_type_integration = 0x80, - sdb_type_repo_url = 0x81, - sdb_type_synthesis = 0x82, - sdb_type_empty = 0xFF, -}; - -/* Type 0: interconnect (first of the array) - * - * sdb_records is the length of the table including this first - * record, version is 1. The bus type is enumerated later. 
- */ -#define SDB_MAGIC 0x5344422d /* "SDB-" */ -struct sdb_interconnect { - uint32_t sdb_magic; /* 0x00-0x03 */ - uint16_t sdb_records; /* 0x04-0x05 */ - uint8_t sdb_version; /* 0x06 */ - uint8_t sdb_bus_type; /* 0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 1: device - * - * class is 0 for "custom device", other values are - * to be standardized; ABI version is for the driver, - * bus-specific bits are defined by each bus (see below) - */ -struct sdb_device { - uint16_t abi_class; /* 0x00-0x01 */ - uint8_t abi_ver_major; /* 0x02 */ - uint8_t abi_ver_minor; /* 0x03 */ - uint32_t bus_specific; /* 0x04-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 2: bridge - * - * child is the address of the nested SDB table - */ -struct sdb_bridge { - uint64_t sdb_child; /* 0x00-0x07 */ - struct sdb_component sdb_component; /* 0x08-0x3f */ -}; - -/* Type 0x80: integration - * - * all types with bit 7 set are meta-information, so - * software can ignore the types it doesn't know. Here we - * just provide product information for an aggregate device - */ -struct sdb_integration { - uint8_t reserved[24]; /* 0x00-0x17 */ - struct sdb_product product; /* 0x08-0x3f */ -}; - -/* Type 0x81: Top module repository url - * - * again, an informative field that software can ignore - */ -struct sdb_repo_url { - uint8_t repo_url[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0x82: Synthesis tool information - * - * this informative record - */ -struct sdb_synthesis { - uint8_t syn_name[16]; /* 0x00-0x0f */ - uint8_t commit_id[16]; /* 0x10-0x1f */ - uint8_t tool_name[8]; /* 0x20-0x27 */ - uint32_t tool_version; /* 0x28-0x2b */ - uint32_t date; /* 0x2c-0x2f */ - uint8_t user_name[15]; /* 0x30-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* Type 0xff: empty - * - * this allows keeping empty slots during development, - * so they can be filled later with minimal efforts and - * no misleading description is ever shipped -- hopefully. - * It can also be used to pad a table to a desired length. 
- */ -struct sdb_empty { - uint8_t reserved[63]; /* 0x00-0x3e */ - uint8_t record_type; /* 0x3f */ -}; - -/* The type of bus, for bus-specific flags */ -enum sdb_bus_type { - sdb_wishbone = 0x00, - sdb_data = 0x01, -}; - -#define SDB_WB_WIDTH_MASK 0x0f -#define SDB_WB_ACCESS8 0x01 -#define SDB_WB_ACCESS16 0x02 -#define SDB_WB_ACCESS32 0x04 -#define SDB_WB_ACCESS64 0x08 -#define SDB_WB_LITTLE_ENDIAN 0x80 - -#define SDB_DATA_READ 0x04 -#define SDB_DATA_WRITE 0x02 -#define SDB_DATA_EXEC 0x01 - -#endif /* __SDB_H__ */ From 293083f877a7d9dba8165cc3bd159cbfcea28fe7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Nov 2021 14:36:39 +0100 Subject: [PATCH 104/336] pinctrl: tegra194: remove duplicate initializer again An earlier bugfix removed a duplicate field initializer in a macro, but it seems that this came back with the following update: drivers/pinctrl/tegra/pinctrl-tegra194.c:1341:28: error: initialized field overwritten [-Werror=override-init] 1341 | .drv_reg = ((r)), \ | ^ drivers/pinctrl/tegra/pinctrl-tegra194.c:1392:41: note: in expansion of macro 'DRV_PINGROUP_ENTRY_Y' 1392 | #define drive_touch_clk_pcc4 DRV_PINGROUP_ENTRY_Y(0x2004, 12, 5, 20, 5, -1, -1, -1, -1, 1) | ^~~~~~~~~~~~~~~~~~~~ drivers/pinctrl/tegra/pinctrl-tegra194.c:1631:17: note: in expansion of macro 'drive_touch_clk_pcc4' 1631 | drive_##pg_name, \ | ^~~~~~ drivers/pinctrl/tegra/pinctrl-tegra194.c:1636:9: note: in expansion of macro 'PINGROUP' 1636 | PINGROUP(touch_clk_pcc4, GP, TOUCH, RSVD2, RSVD3, 0x2000, 1, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N, "vddio_ao"), | ^~~~~~~~ drivers/pinctrl/tegra/pinctrl-tegra194.c:1341:28: note: (near initialization for 'tegra194_groups[0].drv_reg') 1341 | .drv_reg = ((r)), \ | ^ drivers/pinctrl/tegra/pinctrl-tegra194.c:1392:41: note: in expansion of macro 'DRV_PINGROUP_ENTRY_Y' 1392 | #define drive_touch_clk_pcc4 DRV_PINGROUP_ENTRY_Y(0x2004, 12, 5, 20, 5, -1, -1, -1, -1, 1) | ^~~~~~~~~~~~~~~~~~~~ drivers/pinctrl/tegra/pinctrl-tegra194.c:1631:17: note: in expansion of macro 'drive_touch_clk_pcc4' 1631 | drive_##pg_name, \ | ^~~~~~ drivers/pinctrl/tegra/pinctrl-tegra194.c:1636:9: note: in expansion of macro 'PINGROUP' 1636 | PINGROUP(touch_clk_pcc4, GP, TOUCH, RSVD2, RSVD3, 0x2000, 1, Y, -1, -1, 6, 8, -1, 10, 11, 12, N, -1, -1, N, "vddio_ao"), | ^~~~~~~~ Remove it again. 
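For reference, the warning class behind the line being removed above is easy to reproduce: a designated initializer that names the same field twice trips GCC's -Woverride-init (enabled by -Wextra), and only the last value survives. A tiny self-contained example with a made-up structure:

#include <stdio.h>

struct pingroup {
	int drv_reg;
	int drv_bank;
};

/* gcc -Wextra warns here: "initialized field overwritten" for .drv_reg */
static const struct pingroup pg = {
	.drv_reg  = -1,		/* first initializer for drv_reg ... */
	.drv_bank = 0,
	.drv_reg  = 0x2004,	/* ... overridden; only this value is stored */
};

int main(void)
{
	printf("drv_reg = 0x%x\n", (unsigned int)pg.drv_reg);	/* prints 0x2004, the last initializer */
	return 0;
}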
Fixes: 613c0826081b ("pinctrl: tegra: Add pinmux support for Tegra194") Fixes: 92cadf68e50a ("pinctrl: tegra: pinctrl-tegra194: Do not initialise field twice") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211104133645.1186968-1-arnd@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/tegra/pinctrl-tegra194.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pinctrl/tegra/pinctrl-tegra194.c b/drivers/pinctrl/tegra/pinctrl-tegra194.c index b4fef9185d88..5c1dfcb46749 100644 --- a/drivers/pinctrl/tegra/pinctrl-tegra194.c +++ b/drivers/pinctrl/tegra/pinctrl-tegra194.c @@ -1387,7 +1387,6 @@ static struct tegra_function tegra194_functions[] = { .schmitt_bit = schmitt_b, \ .drvtype_bit = 13, \ .lpdr_bit = e_lpdr, \ - .drv_reg = -1, \ #define drive_touch_clk_pcc4 DRV_PINGROUP_ENTRY_Y(0x2004, 12, 5, 20, 5, -1, -1, -1, -1, 1) #define drive_uart3_rx_pcc6 DRV_PINGROUP_ENTRY_Y(0x200c, 12, 5, 20, 5, -1, -1, -1, -1, 1) From 62209e805b5c68577602a5803a71d8e2e11ee0d3 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 4 Nov 2021 10:08:35 -0700 Subject: [PATCH 105/336] pinctrl: qcom: sm8350: Correct UFS and SDC offsets The downstream TLMM binding covers a group of TLMM-related hardware blocks, but the upstream binding only captures the particular block related to controlling the TLMM pins from an OS. In the translation of the driver from downstream, the offset of 0x100000 was lost for the UFS and SDC pingroups. Fixes: d5d348a3271f ("pinctrl: qcom: Add SM8350 pinctrl driver") Signed-off-by: Bjorn Andersson Reviewed-by: Vinod Koul Reviewed-by: Vladimir Zapolskiy Link: https://lore.kernel.org/r/20211104170835.1993686-1-bjorn.andersson@linaro.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-sm8350.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pinctrl/qcom/pinctrl-sm8350.c b/drivers/pinctrl/qcom/pinctrl-sm8350.c index 4d8f8636c2b3..1c042d39380c 100644 --- a/drivers/pinctrl/qcom/pinctrl-sm8350.c +++ b/drivers/pinctrl/qcom/pinctrl-sm8350.c @@ -1597,10 +1597,10 @@ static const struct msm_pingroup sm8350_groups[] = { [200] = PINGROUP(200, qdss_gpio, _, _, _, _, _, _, _, _), [201] = PINGROUP(201, _, _, _, _, _, _, _, _, _), [202] = PINGROUP(202, _, _, _, _, _, _, _, _, _), - [203] = UFS_RESET(ufs_reset, 0x1d8000), - [204] = SDC_PINGROUP(sdc2_clk, 0x1cf000, 14, 6), - [205] = SDC_PINGROUP(sdc2_cmd, 0x1cf000, 11, 3), - [206] = SDC_PINGROUP(sdc2_data, 0x1cf000, 9, 0), + [203] = UFS_RESET(ufs_reset, 0xd8000), + [204] = SDC_PINGROUP(sdc2_clk, 0xcf000, 14, 6), + [205] = SDC_PINGROUP(sdc2_cmd, 0xcf000, 11, 3), + [206] = SDC_PINGROUP(sdc2_data, 0xcf000, 9, 0), }; static const struct msm_gpio_wakeirq_map sm8350_pdc_map[] = { From 2a19b28f7929866e1cec92a3619f4de9f2d20005 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 16 Nov 2021 09:43:43 +0800 Subject: [PATCH 106/336] blk-mq: cancel blk-mq dispatch work in both blk_cleanup_queue and disk_release() For avoiding to slow down queue destroy, we don't call blk_mq_quiesce_queue() in blk_cleanup_queue(), instead of delaying to cancel dispatch work in blk_release_queue(). However, this way has caused kernel oops[1], reported by Changhui. The log shows that scsi_device can be freed before running blk_release_queue(), which is expected too since scsi_device is released after the scsi disk is closed and the scsi_device is removed. 
Fixes the issue by canceling blk-mq dispatch work in both blk_cleanup_queue() and disk_release(): 1) when disk_release() is run, the disk has been closed, and any sync dispatch activities have been done, so canceling dispatch work is enough to quiesce filesystem I/O dispatch activity. 2) in blk_cleanup_queue(), we only focus on passthrough request, and passthrough request is always explicitly allocated & freed by its caller, so once queue is frozen, all sync dispatch activity for passthrough request has been done, then it is enough to just cancel dispatch work for avoiding any dispatch activity. [1] kernel panic log [12622.769416] BUG: kernel NULL pointer dereference, address: 0000000000000300 [12622.777186] #PF: supervisor read access in kernel mode [12622.782918] #PF: error_code(0x0000) - not-present page [12622.788649] PGD 0 P4D 0 [12622.791474] Oops: 0000 [#1] PREEMPT SMP PTI [12622.796138] CPU: 10 PID: 744 Comm: kworker/10:1H Kdump: loaded Not tainted 5.15.0+ #1 [12622.804877] Hardware name: Dell Inc. PowerEdge R730/0H21J3, BIOS 1.5.4 10/002/2015 [12622.813321] Workqueue: kblockd blk_mq_run_work_fn [12622.818572] RIP: 0010:sbitmap_get+0x75/0x190 [12622.823336] Code: 85 80 00 00 00 41 8b 57 08 85 d2 0f 84 b1 00 00 00 45 31 e4 48 63 cd 48 8d 1c 49 48 c1 e3 06 49 03 5f 10 4c 8d 6b 40 83 f0 01 <48> 8b 33 44 89 f2 4c 89 ef 0f b6 c8 e8 fa f3 ff ff 83 f8 ff 75 58 [12622.844290] RSP: 0018:ffffb00a446dbd40 EFLAGS: 00010202 [12622.850120] RAX: 0000000000000001 RBX: 0000000000000300 RCX: 0000000000000004 [12622.858082] RDX: 0000000000000006 RSI: 0000000000000082 RDI: ffffa0b7a2dfe030 [12622.866042] RBP: 0000000000000004 R08: 0000000000000001 R09: ffffa0b742721334 [12622.874003] R10: 0000000000000008 R11: 0000000000000008 R12: 0000000000000000 [12622.881964] R13: 0000000000000340 R14: 0000000000000000 R15: ffffa0b7a2dfe030 [12622.889926] FS: 0000000000000000(0000) GS:ffffa0baafb40000(0000) knlGS:0000000000000000 [12622.898956] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [12622.905367] CR2: 0000000000000300 CR3: 0000000641210001 CR4: 00000000001706e0 [12622.913328] Call Trace: [12622.916055] [12622.918394] scsi_mq_get_budget+0x1a/0x110 [12622.922969] __blk_mq_do_dispatch_sched+0x1d4/0x320 [12622.928404] ? pick_next_task_fair+0x39/0x390 [12622.933268] __blk_mq_sched_dispatch_requests+0xf4/0x140 [12622.939194] blk_mq_sched_dispatch_requests+0x30/0x60 [12622.944829] __blk_mq_run_hw_queue+0x30/0xa0 [12622.949593] process_one_work+0x1e8/0x3c0 [12622.954059] worker_thread+0x50/0x3b0 [12622.958144] ? rescuer_thread+0x370/0x370 [12622.962616] kthread+0x158/0x180 [12622.966218] ? set_kthread_struct+0x40/0x40 [12622.970884] ret_from_fork+0x22/0x30 [12622.974875] [12622.977309] Modules linked in: scsi_debug rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs sunrpc dm_multipath intel_rapl_msr intel_rapl_common dell_wmi_descriptor sb_edac rfkill video x86_pkg_temp_thermal intel_powerclamp dcdbas coretemp kvm_intel kvm mgag200 irqbypass i2c_algo_bit rapl drm_kms_helper ipmi_ssif intel_cstate intel_uncore syscopyarea sysfillrect sysimgblt fb_sys_fops pcspkr cec mei_me lpc_ich mei ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter drm fuse xfs libcrc32c sr_mod cdrom sd_mod t10_pi sg ixgbe ahci libahci crct10dif_pclmul crc32_pclmul crc32c_intel libata megaraid_sas ghash_clmulni_intel tg3 wdat_wdt mdio dca wmi dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_debug] Reported-by: ChanghuiZhong Cc: Christoph Hellwig Cc: "Martin K. 
Petersen" Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211116014343.610501-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- block/blk-mq.c | 13 +++++++++++++ block/blk-mq.h | 2 ++ block/blk-sysfs.c | 10 ---------- block/genhd.c | 2 ++ 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9ee32f85d74e..f0f38ca8e22f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -363,8 +363,10 @@ void blk_cleanup_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_DEAD, q); blk_sync_queue(q); - if (queue_is_mq(q)) + if (queue_is_mq(q)) { + blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); + } /* * In theory, request pool of sched_tags belongs to request queue. diff --git a/block/blk-mq.c b/block/blk-mq.c index 5e1c9fd99353..eecbd7e6fea2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4417,6 +4417,19 @@ unsigned int blk_mq_rq_cpu(struct request *rq) } EXPORT_SYMBOL(blk_mq_rq_cpu); +void blk_mq_cancel_work_sync(struct request_queue *q) +{ + if (queue_is_mq(q)) { + struct blk_mq_hw_ctx *hctx; + int i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); + } +} + static int __init blk_mq_init(void) { int i; diff --git a/block/blk-mq.h b/block/blk-mq.h index 8acfa650f575..afcf9931a489 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -128,6 +128,8 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); +void blk_mq_cancel_work_sync(struct request_queue *q); + void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index cef1f713370b..cd75b0f73dc6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -791,16 +791,6 @@ static void blk_release_queue(struct kobject *kobj) blk_free_queue_stats(q->stats); - if (queue_is_mq(q)) { - struct blk_mq_hw_ctx *hctx; - int i; - - cancel_delayed_work_sync(&q->requeue_work); - - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->run_work); - } - blk_exit_queue(q); blk_queue_free_zone_bitmaps(q); diff --git a/block/genhd.c b/block/genhd.c index c5392cc24d37..30362aeacac4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1111,6 +1111,8 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); + blk_mq_cancel_work_sync(disk->queue); + disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); From 5e0bc3082e2e403ac0753e099c2b01446bb35578 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Nov 2021 18:22:26 +0400 Subject: [PATCH 107/336] bpf: Forbid bpf_ktime_get_coarse_ns and bpf_timer_* in tracing progs Use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in tracing progs may result in locking issues. 
bpf_ktime_get_coarse_ns() uses ktime_get_coarse_ns() time accessor that isn't safe for any context: ====================================================== WARNING: possible circular locking dependency detected 5.15.0-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.4/14877 is trying to acquire lock: ffffffff8cb30008 (tk_core.seq.seqcount){----}-{0:0}, at: ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 but task is already holding lock: ffffffff90dbf200 (&obj_hash[i].lock){-.-.}-{2:2}, at: debug_object_deactivate+0x61/0x400 lib/debugobjects.c:735 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&obj_hash[i].lock){-.-.}-{2:2}: lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0xd1/0x120 kernel/locking/spinlock.c:162 __debug_object_init+0xd9/0x1860 lib/debugobjects.c:569 debug_hrtimer_init kernel/time/hrtimer.c:414 [inline] debug_init kernel/time/hrtimer.c:468 [inline] hrtimer_init+0x20/0x40 kernel/time/hrtimer.c:1592 ntp_init_cmos_sync kernel/time/ntp.c:676 [inline] ntp_init+0xa1/0xad kernel/time/ntp.c:1095 timekeeping_init+0x512/0x6bf kernel/time/timekeeping.c:1639 start_kernel+0x267/0x56e init/main.c:1030 secondary_startup_64_no_verify+0xb1/0xbb -> #0 (tk_core.seq.seqcount){----}-{0:0}: check_prev_add kernel/locking/lockdep.c:3051 [inline] check_prevs_add kernel/locking/lockdep.c:3174 [inline] validate_chain+0x1dfb/0x8240 kernel/locking/lockdep.c:3789 __lock_acquire+0x1382/0x2b00 kernel/locking/lockdep.c:5015 lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 seqcount_lockdep_reader_access+0xfe/0x230 include/linux/seqlock.h:103 ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 ktime_get_coarse include/linux/timekeeping.h:120 [inline] ktime_get_coarse_ns include/linux/timekeeping.h:126 [inline] ____bpf_ktime_get_coarse_ns kernel/bpf/helpers.c:173 [inline] bpf_ktime_get_coarse_ns+0x7e/0x130 kernel/bpf/helpers.c:171 bpf_prog_a99735ebafdda2f1+0x10/0xb50 bpf_dispatcher_nop_func include/linux/bpf.h:721 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] BPF_PROG_RUN_ARRAY include/linux/bpf.h:1294 [inline] trace_call_bpf+0x2cf/0x5d0 kernel/trace/bpf_trace.c:127 perf_trace_run_bpf_submit+0x7b/0x1d0 kernel/events/core.c:9708 perf_trace_lock+0x37c/0x440 include/trace/events/lock.h:39 trace_lock_release+0x128/0x150 include/trace/events/lock.h:58 lock_release+0x82/0x810 kernel/locking/lockdep.c:5636 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:149 [inline] _raw_spin_unlock_irqrestore+0x75/0x130 kernel/locking/spinlock.c:194 debug_hrtimer_deactivate kernel/time/hrtimer.c:425 [inline] debug_deactivate kernel/time/hrtimer.c:481 [inline] __run_hrtimer kernel/time/hrtimer.c:1653 [inline] __hrtimer_run_queues+0x2f9/0xa60 kernel/time/hrtimer.c:1749 hrtimer_interrupt+0x3b3/0x1040 kernel/time/hrtimer.c:1811 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline] __sysvec_apic_timer_interrupt+0xf9/0x270 arch/x86/kernel/apic/apic.c:1103 sysvec_apic_timer_interrupt+0x8c/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] _raw_spin_unlock_irqrestore+0xd4/0x130 kernel/locking/spinlock.c:194 try_to_wake_up+0x702/0xd20 kernel/sched/core.c:4118 wake_up_process kernel/sched/core.c:4200 [inline] 
wake_up_q+0x9a/0xf0 kernel/sched/core.c:953 futex_wake+0x50f/0x5b0 kernel/futex/waitwake.c:184 do_futex+0x367/0x560 kernel/futex/syscalls.c:127 __do_sys_futex kernel/futex/syscalls.c:199 [inline] __se_sys_futex+0x401/0x4b0 kernel/futex/syscalls.c:180 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae There is a possible deadlock with bpf_timer_* set of helpers: hrtimer_start() lock_base(); trace_hrtimer...() perf_event() bpf_run() bpf_timer_start() hrtimer_start() lock_base() <- DEADLOCK Forbid use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_PERF_EVENT and BPF_PROG_TYPE_RAW_TRACEPOINT prog types. Fixes: d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Reported-by: syzbot+43fd005b5a1b4d10781e@syzkaller.appspotmail.com Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211113142227.566439-2-me@ubique.spb.ru --- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/helpers.c | 2 -- kernel/bpf/verifier.c | 7 +++++++ kernel/trace/bpf_trace.c | 2 -- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 2 ++ 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2ca643af9a54..43eb3501721b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1809,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_new_value_proto; case BPF_FUNC_sysctl_set_new_value: return &bpf_sysctl_set_new_value_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return cgroup_base_func_proto(func_id, prog); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..649f07623df6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1364,8 +1364,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aab7482ed1c3..65d2f93b7030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11632,6 +11632,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (map_value_has_timer(map)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_timer yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7396488793ff..ae9755037b7e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1111,8 +1111,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/net/core/filter.c b/net/core/filter.c index e471c9b09670..6102f093d59a 100644 --- 
a/net/core/filter.c +++ b/net/core/filter.c @@ -7162,6 +7162,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10327,6 +10329,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10833,6 +10837,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 2cf02b4d77fb..4bb9401b0a3f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -205,6 +205,8 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, offsetof(struct tcp_congestion_ops, release)) return &bpf_sk_getsockopt_proto; return NULL; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } From e60e6962c503f337531f80e2752423b5bd885443 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Nov 2021 18:22:27 +0400 Subject: [PATCH 108/336] selftests/bpf: Add tests for restricted helpers This patch adds tests that bpf_ktime_get_coarse_ns(), bpf_timer_* and bpf_spin_lock()/bpf_spin_unlock() helpers are forbidden in tracing progs as their use there may result in various locking issues. 
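For reference, the kind of program these tests cover looks roughly like the sketch below (hypothetical, not one of the added selftests); with the verifier changes it is rejected with "unknown func bpf_ktime_get_coarse_ns":

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("kprobe/sys_nanosleep")
  int kprobe_coarse_ns(void *ctx)
  {
  	/* restricted helper called from a tracing program */
  	__u64 t = bpf_ktime_get_coarse_ns();

  	return t == 0;
  }

  char LICENSE[] SEC("license") = "GPL";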
Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211113142227.566439-3-me@ubique.spb.ru --- .../bpf/prog_tests/helper_restricted.c | 33 +++ .../bpf/progs/test_helper_restricted.c | 123 +++++++++++ tools/testing/selftests/bpf/test_verifier.c | 46 +++- .../bpf/verifier/helper_restricted.c | 196 ++++++++++++++++++ 4 files changed, 397 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/helper_restricted.c create mode 100644 tools/testing/selftests/bpf/progs/test_helper_restricted.c create mode 100644 tools/testing/selftests/bpf/verifier/helper_restricted.c diff --git a/tools/testing/selftests/bpf/prog_tests/helper_restricted.c b/tools/testing/selftests/bpf/prog_tests/helper_restricted.c new file mode 100644 index 000000000000..e1de5f80c3b2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/helper_restricted.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_helper_restricted.skel.h" + +void test_helper_restricted(void) +{ + int prog_i = 0, prog_cnt; + int duration = 0; + + do { + struct test_helper_restricted *test; + int maybeOK; + + test = test_helper_restricted__open(); + if (!ASSERT_OK_PTR(test, "open")) + return; + + prog_cnt = test->skeleton->prog_cnt; + + for (int j = 0; j < prog_cnt; ++j) { + struct bpf_program *prog = *test->skeleton->progs[j].prog; + + maybeOK = bpf_program__set_autoload(prog, prog_i == j); + ASSERT_OK(maybeOK, "set autoload"); + } + + maybeOK = test_helper_restricted__load(test); + CHECK(!maybeOK, test->skeleton->progs[prog_i].name, "helper isn't restricted"); + + test_helper_restricted__destroy(test); + } while (++prog_i < prog_cnt); +} diff --git a/tools/testing/selftests/bpf/progs/test_helper_restricted.c b/tools/testing/selftests/bpf/progs/test_helper_restricted.c new file mode 100644 index 000000000000..68d64c365f90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_helper_restricted.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +struct timer { + struct bpf_timer t; +}; + +struct lock { + struct bpf_spin_lock l; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct timer); +} timers SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct lock); +} locks SEC(".maps"); + +static int timer_cb(void *map, int *key, struct timer *timer) +{ + return 0; +} + +static void timer_work(void) +{ + struct timer *timer; + const int key = 0; + + timer = bpf_map_lookup_elem(&timers, &key); + if (timer) { + bpf_timer_init(&timer->t, &timers, CLOCK_MONOTONIC); + bpf_timer_set_callback(&timer->t, timer_cb); + bpf_timer_start(&timer->t, 10E9, 0); + bpf_timer_cancel(&timer->t); + } +} + +static void spin_lock_work(void) +{ + const int key = 0; + struct lock *lock; + + lock = bpf_map_lookup_elem(&locks, &key); + if (lock) { + bpf_spin_lock(&lock->l); + bpf_spin_unlock(&lock->l); + } +} + +SEC("raw_tp/sys_enter") +int raw_tp_timer(void *ctx) +{ + timer_work(); + + return 0; +} + +SEC("tp/syscalls/sys_enter_nanosleep") +int tp_timer(void *ctx) +{ + timer_work(); + + return 0; +} + +SEC("kprobe/sys_nanosleep") +int kprobe_timer(void *ctx) +{ + timer_work(); + + return 0; +} + +SEC("perf_event") +int perf_event_timer(void *ctx) +{ + timer_work(); + + return 0; +} + +SEC("raw_tp/sys_enter") +int raw_tp_spin_lock(void *ctx) +{ + spin_lock_work(); + + return 0; 
+} + +SEC("tp/syscalls/sys_enter_nanosleep") +int tp_spin_lock(void *ctx) +{ + spin_lock_work(); + + return 0; +} + +SEC("kprobe/sys_nanosleep") +int kprobe_spin_lock(void *ctx) +{ + spin_lock_work(); + + return 0; +} + +SEC("perf_event") +int perf_event_spin_lock(void *ctx) +{ + spin_lock_work(); + + return 0; +} + +const char LICENSE[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 25afe423b3f0..465ef3f112c0 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -92,6 +92,7 @@ struct bpf_test { int fixup_map_event_output[MAX_FIXUPS]; int fixup_map_reuseport_array[MAX_FIXUPS]; int fixup_map_ringbuf[MAX_FIXUPS]; + int fixup_map_timer[MAX_FIXUPS]; /* Expected verifier log output for result REJECT or VERBOSE_ACCEPT. * Can be a tab-separated sequence of expected strings. An empty string * means no log verification. @@ -604,8 +605,15 @@ static int create_cgroup_storage(bool percpu) * int cnt; * struct bpf_spin_lock l; * }; + * struct bpf_timer { + * __u64 :64; + * __u64 :64; + * } __attribute__((aligned(8))); + * struct timer { + * struct bpf_timer t; + * }; */ -static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l"; +static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l\0bpf_timer\0timer\0t"; static __u32 btf_raw_types[] = { /* int */ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ @@ -616,6 +624,11 @@ static __u32 btf_raw_types[] = { BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8), BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */ BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */ + /* struct bpf_timer */ /* [4] */ + BTF_TYPE_ENC(25, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0), 16), + /* struct timer */ /* [5] */ + BTF_TYPE_ENC(35, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 16), + BTF_MEMBER_ENC(41, 4, 0), /* struct bpf_timer t; */ }; static int load_btf(void) @@ -696,6 +709,29 @@ static int create_sk_storage_map(void) return fd; } +static int create_map_timer(void) +{ + struct bpf_create_map_attr attr = { + .name = "test_map", + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = 4, + .value_size = 16, + .max_entries = 1, + .btf_key_type_id = 1, + .btf_value_type_id = 5, + }; + int fd, btf_fd; + + btf_fd = load_btf(); + if (btf_fd < 0) + return -1; + attr.btf_fd = btf_fd; + fd = bpf_create_map_xattr(&attr); + if (fd < 0) + printf("Failed to create map with timer\n"); + return fd; +} + static char bpf_vlog[UINT_MAX >> 8]; static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, @@ -722,6 +758,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_map_event_output = test->fixup_map_event_output; int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; int *fixup_map_ringbuf = test->fixup_map_ringbuf; + int *fixup_map_timer = test->fixup_map_timer; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -907,6 +944,13 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_ringbuf++; } while (*fixup_map_ringbuf); } + if (*fixup_map_timer) { + map_fds[21] = create_map_timer(); + do { + prog[*fixup_map_timer].imm = map_fds[21]; + fixup_map_timer++; + } while (*fixup_map_timer); + } } struct libcap { diff --git a/tools/testing/selftests/bpf/verifier/helper_restricted.c b/tools/testing/selftests/bpf/verifier/helper_restricted.c new file mode 100644 index 000000000000..a067b7098b97 --- /dev/null +++ 
b/tools/testing/selftests/bpf/verifier/helper_restricted.c @@ -0,0 +1,196 @@ +{ + "bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unknown func bpf_ktime_get_coarse_ns", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_KPROBE, +}, +{ + "bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_TRACEPOINT", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unknown func bpf_ktime_get_coarse_ns", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_PERF_EVENT", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unknown func bpf_ktime_get_coarse_ns", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_PERF_EVENT, +}, +{ + "bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ktime_get_coarse_ns), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unknown func bpf_ktime_get_coarse_ns", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT, +}, +{ + "bpf_timer_init isn restricted in BPF_PROG_TYPE_KPROBE", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_EMIT_CALL(BPF_FUNC_timer_init), + BPF_EXIT_INSN(), + }, + .fixup_map_timer = { 3, 8 }, + .errstr = "tracing progs cannot use bpf_timer yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_KPROBE, +}, +{ + "bpf_timer_init is forbidden in BPF_PROG_TYPE_PERF_EVENT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_EMIT_CALL(BPF_FUNC_timer_init), + BPF_EXIT_INSN(), + }, + .fixup_map_timer = { 3, 8 }, + .errstr = "tracing progs cannot use bpf_timer yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_PERF_EVENT, +}, +{ + "bpf_timer_init is forbidden in BPF_PROG_TYPE_TRACEPOINT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_EMIT_CALL(BPF_FUNC_timer_init), + BPF_EXIT_INSN(), + }, + .fixup_map_timer = { 3, 8 }, + .errstr = "tracing progs cannot use bpf_timer yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "bpf_timer_init is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, 
BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_EMIT_CALL(BPF_FUNC_timer_init), + BPF_EXIT_INSN(), + }, + .fixup_map_timer = { 3, 8 }, + .errstr = "tracing progs cannot use bpf_timer yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT, +}, +{ + "bpf_spin_lock is forbidden in BPF_PROG_TYPE_KPROBE", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 3 }, + .errstr = "tracing progs cannot use bpf_spin_lock yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_KPROBE, +}, +{ + "bpf_spin_lock is forbidden in BPF_PROG_TYPE_TRACEPOINT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 3 }, + .errstr = "tracing progs cannot use bpf_spin_lock yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, +}, +{ + "bpf_spin_lock is forbidden in BPF_PROG_TYPE_PERF_EVENT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 3 }, + .errstr = "tracing progs cannot use bpf_spin_lock yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_PERF_EVENT, +}, +{ + "bpf_spin_lock is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_spin_lock), + BPF_EXIT_INSN(), + }, + .fixup_map_spin_lock = { 3 }, + .errstr = "tracing progs cannot use bpf_spin_lock yet", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT, +}, From 6060a6cb05e3223146a3c30a1977f136da6c85e7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 15 Nov 2021 14:07:41 +0100 Subject: [PATCH 109/336] samples/bpf: Fix build error due to -isystem removal Since recent Kbuild updates we no longer include files from compiler directories. However, samples/bpf/hbm_kern.h hasn't been tuned for this (LLVM 13): CLANG-bpf samples/bpf/hbm_out_kern.o In file included from samples/bpf/hbm_out_kern.c:55: samples/bpf/hbm_kern.h:12:10: fatal error: 'stddef.h' file not found ^~~~~~~~~~ 1 error generated. 
CLANG-bpf samples/bpf/hbm_edt_kern.o In file included from samples/bpf/hbm_edt_kern.c:53: samples/bpf/hbm_kern.h:12:10: fatal error: 'stddef.h' file not found ^~~~~~~~~~ 1 error generated. It is enough to just drop both stdbool.h and stddef.h from includes to fix those. Fixes: 04e85bbf71c9 ("isystem: delete global -isystem compile option") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Reviewed-by: Michal Swiatkowski Link: https://lore.kernel.org/bpf/20211115130741.3584-1-alexandr.lobakin@intel.com Signed-off-by: Alexei Starovoitov --- samples/bpf/hbm_kern.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index 722b3fadb467..1752a46a2b05 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -9,8 +9,6 @@ * Include file for sample Host Bandwidth Manager (HBM) BPF programs */ #define KBUILD_MODNAME "foo" -#include -#include #include #include #include From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Nov 2021 18:48:08 +0000 Subject: [PATCH 110/336] bpf: Fix toctou on read-only map's constant scalar tracking Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is checking whether maps are read-only both from BPF program side and user space side, and then, given their content is constant, reading out their data via map->ops->map_direct_value_addr() which is then subsequently used as known scalar value for the register, that is, it is marked as __mark_reg_known() with the read value at verification time. Before a23740ec43ba, the register content was marked as an unknown scalar so the verifier could not make any assumptions about the map content. The current implementation however is prone to a TOCTOU race, meaning, the value read as known scalar for the register is not guaranteed to be exactly the same at a later point when the program is executed, and as such, the prior made assumptions of the verifier with regards to the program will be invalid which can cause issues such as OOB access, etc. While the BPF_F_RDONLY_PROG map flag is always fixed and required to be specified at map creation time, the map->frozen property is initially set to false for the map given the map value needs to be populated, e.g. for global data sections. Once complete, the loader "freezes" the map from user space such that no subsequent updates/deletes are possible anymore. For the rest of the lifetime of the map, this freeze one-time trigger cannot be undone anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_* cmd calls which would update/delete map entries will be rejected with -EPERM since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also means that pending update/delete map entries must still complete before this guarantee is given. This corner case is not an issue for loaders since they create and prepare such program private map in successive steps. However, a malicious user is able to trigger this TOCTOU race in two different ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is used to expand the competition interval, so that map_update_elem() can modify the contents of the map after map_freeze() and bpf_prog_load() were executed. This works, because userfaultfd halts the parallel thread which triggered a map_update_elem() at the time where we copy key/value from the user buffer and this already passed the FMODE_CAN_WRITE capability test given at that time the map was not "frozen". 
Then, the main thread performs the map_freeze() and bpf_prog_load(), and once that had completed successfully, the other thread is woken up to complete the pending map_update_elem() which then changes the map content. For ii) the idea of the batched update is similar, meaning, when there are a large number of updates to be processed, it can increase the competition interval between the two. It is therefore possible in practice to modify the contents of the map after executing map_freeze() and bpf_prog_load(). One way to fix both i) and ii) at the same time is to expand the use of the map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with the rationale to make a writable mmap()'ing of a map mutually exclusive with read-only freezing. The counter indicates writable mmap() mappings and then prevents/fails the freeze operation. Its semantics can be expanded beyond just mmap() by generally indicating ongoing write phases. This would essentially span any parallel regular and batched flavor of update/delete operation and then also have map_freeze() fail with -EBUSY. For the check_mem_access() in the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all last pending writes have completed via bpf_map_write_active() test. Once the map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0 only then we are really guaranteed to use the map's data as known constants. For map->frozen being set and pending writes in process of still being completed we fall back to marking that register as unknown scalar so we don't end up making assumptions about it. With this, both TOCTOU reproducers from i) and ii) are fixed. Note that the map->writecnt has been converted into a atomic64 in the fix in order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning the freeze_mutex over entire map update/delete operations in syscall side would not be possible due to then causing everything to be serialized. Similarly, something like synchronize_rcu() after setting map->frozen to wait for update/deletes to complete is not possible either since it would also have to span the user copy which can sleep. On the libbpf side, this won't break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed mmap()-ed memory where for .rodata it's non-writable. 
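The essence of the rule can be modelled in plain C (a standalone sketch, not kernel code): update/delete paths hold a writer count for their whole duration, map_freeze() refuses to proceed while that count is non-zero, and only "frozen plus zero writers" lets values be treated as constants:

  #include <errno.h>
  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdio.h>

  static atomic_long writecnt;
  static bool frozen;

  static void map_update_begin(void) { atomic_fetch_add(&writecnt, 1); }
  static void map_update_end(void)   { atomic_fetch_sub(&writecnt, 1); }

  static int map_freeze(void)
  {
  	if (atomic_load(&writecnt) != 0)
  		return -EBUSY;		/* pending writer, cannot freeze */
  	frozen = true;
  	return 0;
  }

  static bool map_is_rdonly(void)
  {
  	return frozen && atomic_load(&writecnt) == 0;
  }

  int main(void)
  {
  	map_update_begin();			/* write still in flight */
  	printf("freeze: %d\n", map_freeze());	/* -EBUSY */
  	map_update_end();
  	printf("freeze: %d\n", map_freeze());	/* 0 */
  	printf("rdonly: %d\n", map_is_rdonly());/* 1 */
  	return 0;
  }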
Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: w1tcher.bupt@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 57 +++++++++++++++++++++++++++---------------- kernel/bpf/verifier.c | 17 ++++++++++++- 3 files changed, 54 insertions(+), 23 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f715e8863f4d..e7a163a3146b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,7 +193,7 @@ struct bpf_map { atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; - u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + atomic64_t writecnt; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); +bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..1033ee8c0caf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -132,6 +132,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static void bpf_map_write_active_inc(struct bpf_map *map) +{ + atomic64_inc(&map->writecnt); +} + +static void bpf_map_write_active_dec(struct bpf_map *map) +{ + atomic64_dec(&map->writecnt); +} + +bool bpf_map_write_active(const struct bpf_map *map) +{ + return atomic64_read(&map->writecnt) != 0; +} + static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt++; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ @@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt--; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) goto out; if (vma->vm_flags & VM_MAYWRITE) - map->writecnt++; + bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; @@ -1139,6 +1148,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1174,6 +1184,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1196,6 +1207,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; 
@@ -1226,6 +1238,7 @@ static int map_delete_elem(union bpf_attr *attr) out: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1533,6 +1546,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; @@ -1597,6 +1611,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1624,8 +1639,7 @@ static int map_freeze(const union bpf_attr *attr) } mutex_lock(&map->freeze_mutex); - - if (map->writecnt) { + if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } @@ -4171,6 +4185,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { + bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; + bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; @@ -4183,16 +4200,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if ((cmd == BPF_MAP_LOOKUP_BATCH || - cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && - !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + if (has_write) + bpf_map_write_active_inc(map); + if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } - - if (cmd != BPF_MAP_LOOKUP_BATCH && - !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -4205,8 +4219,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, BPF_DO_BATCH(map->ops->map_update_batch); else BPF_DO_BATCH(map->ops->map_delete_batch); - err_put: + if (has_write) + bpf_map_write_active_dec(map); fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65d2f93b7030..50efda51515b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4056,7 +4056,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static bool bpf_map_is_rdonly(const struct bpf_map *map) { - return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; + /* A map is considered read-only if the following condition are true: + * + * 1) BPF program side cannot change any of the map content. The + * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map + * and was set at map creation time. + * 2) The map value(s) have been initialized from user space by a + * loader and then "frozen", such that no new map update/delete + * operations from syscall side are possible for the rest of + * the map's lifetime from that point onwards. + * 3) Any parallel/pending map update/delete operations from syscall + * side have been completed. Only after that point, it's safe to + * assume that map value(s) are immutable. + */ + return (map->map_flags & BPF_F_RDONLY_PROG) && + READ_ONCE(map->frozen) && + !bpf_map_write_active(map); } static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) From b3ff2881ba18b852f79f5476d7631940071f1adb Mon Sep 17 00:00:00 2001 From: Wang Haojun Date: Wed, 3 Nov 2021 10:55:21 +0800 Subject: [PATCH 111/336] MIPS: syscalls: Wire up futex_waitv syscall Wire up the futex_waitv syscall. 
Fix Build warning: #warning syscall futex_waitv not implemented [-Wcpp] Signed-off-by: Wang Haojun Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 70e32de2bcaa..72d02d363f36 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -387,3 +387,4 @@ 446 n32 landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 n32 process_mrelease sys_process_mrelease +449 n32 futex_waitv sys_futex_waitv diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 1ca7bc337932..e2c481fcede6 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -363,3 +363,4 @@ 446 n64 landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease +449 n64 futex_waitv sys_futex_waitv diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index a61c35edaa74..3714c97b2643 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -436,3 +436,4 @@ 446 o32 landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 o32 process_mrelease sys_process_mrelease +449 o32 futex_waitv sys_futex_waitv From 255e51da15baed47531beefd02f222e4dc01f1c1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Nov 2021 23:28:24 +0000 Subject: [PATCH 112/336] MIPS: generic/yamon-dt: fix uninitialized variable error In the case where fw_getenv returns an error when fetching values for ememsizea and memsize then variable phys_memsize is not assigned a variable and will be uninitialized on a zero check of phys_memsize. Fix this by initializing phys_memsize to zero. Cleans up cppcheck error: arch/mips/generic/yamon-dt.c:100:7: error: Uninitialized variable: phys_memsize [uninitvar] Fixes: f41d2430bbd6 ("MIPS: generic/yamon-dt: Support > 256MB of RAM") Signed-off-by: Colin Ian King Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/yamon-dt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/generic/yamon-dt.c b/arch/mips/generic/yamon-dt.c index a3aa22c77cad..a07a5edbcda7 100644 --- a/arch/mips/generic/yamon-dt.c +++ b/arch/mips/generic/yamon-dt.c @@ -75,7 +75,7 @@ static unsigned int __init gen_fdt_mem_array( __init int yamon_dt_append_memory(void *fdt, const struct yamon_mem_region *regions) { - unsigned long phys_memsize, memsize; + unsigned long phys_memsize = 0, memsize; __be32 mem_array[2 * MAX_MEM_ARRAY_ENTRIES]; unsigned int mem_entries; int i, err, mem_off; From e8f67482e5a4bc8d0b65d606d08cb60ee123b468 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Nov 2021 16:42:18 -0800 Subject: [PATCH 113/336] mips: bcm63xx: add support for clk_get_parent() BCM63XX selects HAVE_LEGACY_CLK but does not provide/support clk_get_parent(), so add a simple implementation of that function so that callers of it will build without errors. 
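Such callers use the API roughly as in the sketch below (illustrative only, loosely modelled on the ingenic-adc usage; with this patch the symbol resolves, and on BCM63xx the stub simply reports no parent, which callers must tolerate):

  #include <linux/clk.h>
  #include <linux/printk.h>

  static void example_show_parent(struct clk *clk)
  {
  	struct clk *parent = clk_get_parent(clk);

  	if (!parent)
  		pr_debug("no parent clock reported\n");
  }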
Fixes these build errors: mips-linux-ld: drivers/iio/adc/ingenic-adc.o: in function `jz4770_adc_init_clk_div': ingenic-adc.c:(.text+0xe4): undefined reference to `clk_get_parent' mips-linux-ld: drivers/iio/adc/ingenic-adc.o: in function `jz4725b_adc_init_clk_div': ingenic-adc.c:(.text+0x1b8): undefined reference to `clk_get_parent' Fixes: e7300d04bd08 ("MIPS: BCM63xx: Add support for the Broadcom BCM63xx family of SOCs." ) Signed-off-by: Randy Dunlap Reported-by: kernel test robot Suggested-by: Russell King (Oracle) Cc: Artur Rojek Cc: Paul Cercueil Cc: linux-mips@vger.kernel.org Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: linux-iio@vger.kernel.org Cc: Florian Fainelli Cc: Andy Shevchenko Cc: Russell King Cc: bcm-kernel-feedback-list@broadcom.com Cc: Jonas Gorski Reviewed-by: Andy Shevchenko Acked-by: Jonathan Cameron Acked-by: Russell King (Oracle) Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/bcm63xx/clk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/bcm63xx/clk.c b/arch/mips/bcm63xx/clk.c index 5a3e325275d0..1c91064cb448 100644 --- a/arch/mips/bcm63xx/clk.c +++ b/arch/mips/bcm63xx/clk.c @@ -381,6 +381,12 @@ void clk_disable(struct clk *clk) EXPORT_SYMBOL(clk_disable); +struct clk *clk_get_parent(struct clk *clk) +{ + return NULL; +} +EXPORT_SYMBOL(clk_get_parent); + unsigned long clk_get_rate(struct clk *clk) { if (!clk) From fc1aabb088860d6cf9dd03612b7a6f0de91ccac2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Nov 2021 17:20:51 -0800 Subject: [PATCH 114/336] mips: lantiq: add support for clk_get_parent() Provide a simple implementation of clk_get_parent() in the lantiq subarch so that callers of it will build without errors. Fixes this build error: ERROR: modpost: "clk_get_parent" [drivers/iio/adc/ingenic-adc.ko] undefined! Fixes: 171bb2f19ed6 ("MIPS: Lantiq: Add initial support for Lantiq SoCs") Signed-off-by: Randy Dunlap Suggested-by: Russell King (Oracle) Cc: linux-mips@vger.kernel.org Cc: John Crispin Cc: Thomas Bogendoerfer Cc: Jonathan Cameron Cc: linux-iio@vger.kernel.org Cc: Russell King Cc: Andy Shevchenko Acked-by: Jonathan Cameron Acked-by: John Crispin Signed-off-by: Thomas Bogendoerfer --- arch/mips/lantiq/clk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/mips/lantiq/clk.c b/arch/mips/lantiq/clk.c index dd819e31fcbb..4916cccf378f 100644 --- a/arch/mips/lantiq/clk.c +++ b/arch/mips/lantiq/clk.c @@ -158,6 +158,12 @@ void clk_deactivate(struct clk *clk) } EXPORT_SYMBOL(clk_deactivate); +struct clk *clk_get_parent(struct clk *clk) +{ + return NULL; +} +EXPORT_SYMBOL(clk_get_parent); + static inline u32 get_counter_resolution(void) { u32 res; From d6912b1251b47e6b04ea8c8881dfb35a6e7a3e29 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Nov 2021 22:46:27 -0800 Subject: [PATCH 115/336] gpio: rockchip: needs GENERIC_IRQ_CHIP to fix build errors gpio-rockchip uses interfaces that are provided by the Kconfig symbol GENERIC_IRQ_CHIP, so the driver should select that symbol in order to prevent build errors. 
Fixes these build errors (and more): aarch64-linux-ld: drivers/gpio/gpio-rockchip.o: in function `rockchip_irq_disable': gpio-rockchip.c:(.text+0x454): undefined reference to `irq_gc_mask_set_bit' aarch64-linux-ld: drivers/gpio/gpio-rockchip.o: in function `rockchip_irq_enable': gpio-rockchip.c:(.text+0x478): undefined reference to `irq_gc_mask_clr_bit' aarch64-linux-ld: drivers/gpio/gpio-rockchip.o: in function `rockchip_interrupts_register': gpio-rockchip.c:(.text+0x518): undefined reference to `irq_generic_chip_ops' aarch64-linux-ld: gpio-rockchip.c:(.text+0x594): undefined reference to `__irq_alloc_domain_generic_chips' aarch64-linux-ld: gpio-rockchip.c:(.text+0x5cc): undefined reference to `irq_get_domain_generic_chip' aarch64-linux-ld: gpio-rockchip.c:(.text+0x5e0): undefined reference to `irq_gc_ack_set_bit' aarch64-linux-ld: gpio-rockchip.c:(.text+0x604): undefined reference to `irq_gc_set_wake' Fixes: 936ee2675eee ("gpio/rockchip: add driver for rockchip gpio") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 072ed610f9c6..60d9374c72c0 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -523,6 +523,7 @@ config GPIO_REG config GPIO_ROCKCHIP tristate "Rockchip GPIO support" depends on ARCH_ROCKCHIP || COMPILE_TEST + select GENERIC_IRQ_CHIP select GPIOLIB_IRQCHIP default ARCH_ROCKCHIP help From 4eaf02d6076c138d929f98b4c8afc4fef6d2915d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 18 Oct 2021 21:27:55 +0200 Subject: [PATCH 116/336] drm/scheduler: fix drm_sched_job_add_implicit_dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trivial fix since we now need to grab a reference to the fence we have added. Previously the dma_resv function where doing that for us. Signed-off-by: Christian König Fixes: 9c2ba265352a ("drm/scheduler: use new iterator in drm_sched_job_add_implicit_dependencies v2") Link: https://patchwork.freedesktop.org/patch/msgid/20211019112706.27769-1-christian.koenig@amd.com Reviewed-by: Daniel Vetter Reported-by: Nicolas Frattaroli References: https://lore.kernel.org/dri-devel/2023306.UmlnhvANQh@archbook/ Tested-by: Nicolas Frattaroli Tested-by: Yassine Oudjana --- drivers/gpu/drm/scheduler/sched_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 5bc5f775abe1..94fe51b3caa2 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -707,6 +707,9 @@ int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job, ret = drm_sched_job_add_dependency(job, fence); if (ret) return ret; + + /* Make sure to grab an additional ref on the added fence */ + dma_fence_get(fence); } return 0; } From 287273a80be5d45d59d1742557cab69cabf0ef3c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Nov 2021 10:43:46 +0300 Subject: [PATCH 117/336] platform/mellanox: mlxreg-lc: fix error code in mlxreg_lc_create_static_devices() This code should be using PTR_ERR() instead of IS_ERR(). And because it's using the wrong "dev->client" pointer, the IS_ERR() check will be false, meaning the function returns success. 
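The general pattern being restored looks like this (a sketch with hypothetical names, not the mlxreg-lc code): IS_ERR() only says whether the pointer encodes an error, while PTR_ERR() extracts the negative errno that should be propagated:

  #include <linux/err.h>

  struct example_dev {
  	void *client;
  };

  /* stand-in for i2c_new_client_device() & co. */
  static void *example_register(struct example_dev *dev)
  {
  	return ERR_PTR(-ENODEV);	/* pretend registration failed */
  }

  static int example_create(struct example_dev *dev)
  {
  	dev->client = example_register(dev);
  	if (IS_ERR(dev->client)) {
  		int ret = PTR_ERR(dev->client);	/* the negative errno */

  		dev->client = NULL;
  		return ret;	/* not IS_ERR(): that is only 0 or 1 */
  	}
  	return 0;
  }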
Fixes: 62f9529b8d5c ("platform/mellanox: mlxreg-lc: Add initial support for Nvidia line card devices") Signed-off-by: Dan Carpenter Acked-by: Vadim Pasternak Link: https://lore.kernel.org/r/20211110074346.GB5176@kili Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/mellanox/mlxreg-lc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/platform/mellanox/mlxreg-lc.c b/drivers/platform/mellanox/mlxreg-lc.c index 0b7f58feb701..c897a2f15840 100644 --- a/drivers/platform/mellanox/mlxreg-lc.c +++ b/drivers/platform/mellanox/mlxreg-lc.c @@ -413,7 +413,7 @@ mlxreg_lc_create_static_devices(struct mlxreg_lc *mlxreg_lc, struct mlxreg_hotpl int size) { struct mlxreg_hotplug_device *dev = devs; - int i; + int i, ret; /* Create static I2C device feeding by auxiliary or main power. */ for (i = 0; i < size; i++, dev++) { @@ -423,6 +423,7 @@ mlxreg_lc_create_static_devices(struct mlxreg_lc *mlxreg_lc, struct mlxreg_hotpl dev->brdinfo->type, dev->nr, dev->brdinfo->addr); dev->adapter = NULL; + ret = PTR_ERR(dev->client); goto fail_create_static_devices; } } @@ -435,7 +436,7 @@ fail_create_static_devices: i2c_unregister_device(dev->client); dev->client = NULL; } - return IS_ERR(dev->client); + return ret; } static void From 707f0c290f2b0a16c0222bb3272079cfe5af831d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 2 Nov 2021 16:32:56 +0100 Subject: [PATCH 118/336] platform/x86: amd-pmc: Make CONFIG_AMD_PMC depend on RTC_CLASS Since the "Add special handling for timer based S0i3 wakeup" changes the amd-pmc code now relies on symbols from the RTC-class code, add a dependency for this to Kconfig. Fixes: 59348401ebed ("platform/x86: amd-pmc: Add special handling for timer based S0i3 wakeup") Cc: Mario Limonciello Reported-by: Randy Dunlap Signed-off-by: Hans de Goede Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20211102153256.76956-1-hdegoede@redhat.com --- drivers/platform/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index d4c079f4afc6..7400bc5da5be 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -185,7 +185,7 @@ config ACER_WMI config AMD_PMC tristate "AMD SoC PMC driver" - depends on ACPI && PCI + depends on ACPI && PCI && RTC_CLASS help The driver provides support for AMD Power Management Controller primarily responsible for S2Idle transactions that are driven from From c961a7d2aa23ae19e0099fbcdf1040fb760eea83 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Nov 2021 20:57:07 +0100 Subject: [PATCH 119/336] platform/x86: hp_accel: Fix an error handling path in 'lis3lv02d_probe()' If 'led_classdev_register()' fails, some additional resources should be released. Add the missing 'i8042_remove_filter()' and 'lis3lv02d_remove_fs()' calls that are already in the remove function but are missing here. 
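The shape of the fix is the usual probe unwind rule (sketch with hypothetical names, not the hp_accel functions): whatever was set up before the failing step has to be torn down again, mirroring the remove path:

  #include <linux/errno.h>

  static int setup_filter(void)     { return 0; }
  static int setup_joystick(void)   { return 0; }
  static int setup_led(void)        { return -EINVAL; }	/* last step fails */
  static void remove_filter(void)   { }
  static void remove_joystick(void) { }

  static int example_probe(void)
  {
  	int ret;

  	ret = setup_filter();
  	if (ret)
  		return ret;

  	ret = setup_joystick();
  	if (ret)
  		goto err_filter;

  	ret = setup_led();		/* e.g. led_classdev_register() */
  	if (ret)
  		goto err_joystick;

  	return 0;

  err_joystick:
  	remove_joystick();
  err_filter:
  	remove_filter();
  	return ret;
  }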
Fixes: a4c724d0723b ("platform: hp_accel: add a i8042 filter to remove HPQ6000 data from kb bus stream") Fixes: 9e0c79782143 ("lis3lv02d: merge with leds hp disk") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/5a4f218f8f16d2e3a7906b7ca3654ffa946895f8.1636314074.git.christophe.jaillet@wanadoo.fr Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/hp_accel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/hp_accel.c b/drivers/platform/x86/hp_accel.c index b183967ecfb7..435a91fe2568 100644 --- a/drivers/platform/x86/hp_accel.c +++ b/drivers/platform/x86/hp_accel.c @@ -331,9 +331,11 @@ static int lis3lv02d_probe(struct platform_device *device) INIT_WORK(&hpled_led.work, delayed_set_status_worker); ret = led_classdev_register(NULL, &hpled_led.led_classdev); if (ret) { + i8042_remove_filter(hp_accel_i8042_filter); lis3lv02d_joystick_disable(&lis3_dev); lis3lv02d_poweroff(&lis3_dev); flush_work(&hpled_led.work); + lis3lv02d_remove_fs(&lis3_dev); return ret; } From 3e58e1c4da396427eb39a72933e47bd4d35bfc0f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Sat, 13 Nov 2021 13:48:27 +0800 Subject: [PATCH 120/336] platform/x86: samsung-laptop: Fix typo in a comment The double `it' is repeated in a comment, therefore one of them is removed. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20211113054827.199517-1-wangborong@cdjrlc.com Signed-off-by: Hans de Goede --- drivers/platform/x86/samsung-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/samsung-laptop.c b/drivers/platform/x86/samsung-laptop.c index 7ee010aa740a..c1d9ed9b7b67 100644 --- a/drivers/platform/x86/samsung-laptop.c +++ b/drivers/platform/x86/samsung-laptop.c @@ -152,7 +152,7 @@ struct sabi_config { static const struct sabi_config sabi_configs[] = { { - /* I don't know if it is really 2, but it it is + /* I don't know if it is really 2, but it is * less than 3 anyway */ .sabi_version = 2, From 0f07c023dcd08ca49b6d3dd018abc7cd56301478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 13 Nov 2021 09:05:51 +0100 Subject: [PATCH 121/336] platform/x86: dell-wmi-descriptor: disable by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dell-wmi-descriptor only provides symbols to other drivers. These drivers already select dell-wmi-descriptor when needed. This fixes an issue where dell-wmi-descriptor is compiled as a module with localyesconfig on a non-Dell machine. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20211113080551.61860-1-linux@weissschuh.net Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/dell/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/Kconfig b/drivers/platform/x86/dell/Kconfig index 2fffa57e596e..fe224a54f24c 100644 --- a/drivers/platform/x86/dell/Kconfig +++ b/drivers/platform/x86/dell/Kconfig @@ -187,7 +187,7 @@ config DELL_WMI_AIO config DELL_WMI_DESCRIPTOR tristate - default m + default n depends on ACPI_WMI config DELL_WMI_LED From 812fcc609502096e98cc3918a4b807722dba8fd9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 8 Nov 2021 11:03:57 -0700 Subject: [PATCH 122/336] platform/x86: think-lmi: Abort probe on analyze failure A Lenovo ThinkStation S20 (4157CTO BIOS 60KT41AUS) fails to boot on recent kernels including the think-lmi driver, due to the fact that errors returned by the tlmi_analyze() function are ignored by tlmi_probe(), where tlmi_sysfs_init() is called unconditionally. This results in making use of an array of already freed, non-null pointers and other uninitialized globals, causing all sorts of nasty kobject and memory faults. Make use of the analyze function return value, free a couple leaked allocations, and remove the settings_count field, which is incremented but never consumed. Fixes: a40cd7ef22fb ("platform/x86: think-lmi: Add WMI interface support on Lenovo platforms") Signed-off-by: Alex Williamson Reviewed-by: Mark Gross Reviewed-by: Mark Pearson Link: https://lore.kernel.org/r/163639463588.1330483.15850167112490200219.stgit@omen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/think-lmi.c | 13 ++++++++++--- drivers/platform/x86/think-lmi.h | 1 - 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/think-lmi.c b/drivers/platform/x86/think-lmi.c index 9472aae72df2..c4d9c45350f7 100644 --- a/drivers/platform/x86/think-lmi.c +++ b/drivers/platform/x86/think-lmi.c @@ -888,8 +888,10 @@ static int tlmi_analyze(void) break; if (!item) break; - if (!*item) + if (!*item) { + kfree(item); continue; + } /* It is not allowed to have '/' for file name. Convert it into '\'. 
*/ strreplace(item, '/', '\\'); @@ -902,6 +904,7 @@ static int tlmi_analyze(void) setting = kzalloc(sizeof(*setting), GFP_KERNEL); if (!setting) { ret = -ENOMEM; + kfree(item); goto fail_clear_attr; } setting->index = i; @@ -916,7 +919,6 @@ static int tlmi_analyze(void) } kobject_init(&setting->kobj, &tlmi_attr_setting_ktype); tlmi_priv.setting[i] = setting; - tlmi_priv.settings_count++; kfree(item); } @@ -983,7 +985,12 @@ static void tlmi_remove(struct wmi_device *wdev) static int tlmi_probe(struct wmi_device *wdev, const void *context) { - tlmi_analyze(); + int ret; + + ret = tlmi_analyze(); + if (ret) + return ret; + return tlmi_sysfs_init(); } diff --git a/drivers/platform/x86/think-lmi.h b/drivers/platform/x86/think-lmi.h index f8e26823075f..2ce5086a5af2 100644 --- a/drivers/platform/x86/think-lmi.h +++ b/drivers/platform/x86/think-lmi.h @@ -55,7 +55,6 @@ struct tlmi_attr_setting { struct think_lmi { struct wmi_device *wmi_device; - int settings_count; bool can_set_bios_settings; bool can_get_bios_selections; bool can_set_bios_password; From 1f338954a5fbe21eb22b4223141e31f2a26366d5 Mon Sep 17 00:00:00 2001 From: Jimmy Wang Date: Fri, 5 Nov 2021 17:05:28 +0800 Subject: [PATCH 123/336] platform/x86: thinkpad_acpi: Add support for dual fan control This adds dual fan control for P1 / X1 Extreme Gen4 Signed-off-by: Jimmy Wang Link: https://lore.kernel.org/r/20211105090528.39677-1-jimmy221b@163.com Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 9c632df734bb..eb201d001075 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -8766,6 +8766,7 @@ static const struct tpacpi_quirk fan_quirk_table[] __initconst = { TPACPI_Q_LNV3('N', '2', 'E', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (1st gen) */ TPACPI_Q_LNV3('N', '2', 'O', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (2nd gen) */ TPACPI_Q_LNV3('N', '2', 'V', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (3nd gen) */ + TPACPI_Q_LNV3('N', '4', '0', TPACPI_FAN_2CTL), /* P1 / X1 Extreme (4nd gen) */ TPACPI_Q_LNV3('N', '3', '0', TPACPI_FAN_2CTL), /* P15 (1st gen) / P15v (1st gen) */ TPACPI_Q_LNV3('N', '3', '2', TPACPI_FAN_2CTL), /* X1 Carbon (9th gen) */ }; From 39f53292181081d35174a581a98441de5da22bc9 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 8 Nov 2021 14:06:48 +0800 Subject: [PATCH 124/336] platform/x86: thinkpad_acpi: Fix WWAN device disabled issue after S3 deep When WWAN device wake from S3 deep, under thinkpad platform, WWAN would be disabled. This disable status could be checked by command 'nmcli r wwan' or 'rfkill list'. Issue analysis as below: When host resume from S3 deep, thinkpad_acpi driver would call hotkey_resume() function. Finnaly, it will use wan_get_status to check the current status of WWAN device. During this resume progress, wan_get_status would always return off even WWAN boot up completely. In patch V2, Hans said 'sw_state should be unchanged after a suspend/resume. It's better to drop the tpacpi_rfk_update_swstate call all together from the resume path'. And it's confimed by Lenovo that GWAN is no longer available from WHL generation because the design does not match with current pin control. 
Signed-off-by: Slark Xiao Link: https://lore.kernel.org/r/20211108060648.8212-1-slark_xiao@163.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- drivers/platform/x86/thinkpad_acpi.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index eb201d001075..b3ac9c3f3b7c 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -1105,15 +1105,6 @@ static int tpacpi_rfk_update_swstate(const struct tpacpi_rfk *tp_rfk) return status; } -/* Query FW and update rfkill sw state for all rfkill switches */ -static void tpacpi_rfk_update_swstate_all(void) -{ - unsigned int i; - - for (i = 0; i < TPACPI_RFK_SW_MAX; i++) - tpacpi_rfk_update_swstate(tpacpi_rfkill_switches[i]); -} - /* * Sync the HW-blocking state of all rfkill switches, * do notice it causes the rfkill core to schedule uevents @@ -3074,9 +3065,6 @@ static void tpacpi_send_radiosw_update(void) if (wlsw == TPACPI_RFK_RADIO_OFF) tpacpi_rfk_update_hwblock_state(true); - /* Sync sw blocking state */ - tpacpi_rfk_update_swstate_all(); - /* Sync hw blocking state last if it is hw-unblocked */ if (wlsw == TPACPI_RFK_RADIO_ON) tpacpi_rfk_update_hwblock_state(false); From d477a907cba317cfa58a8c89c09454d3fced1964 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 9 Nov 2021 20:52:09 +0100 Subject: [PATCH 125/336] platform/x86: thinkpad_acpi: fix documentation for adaptive keyboard The different values were offset by 1. 0 is for "home mode", 1 for "web-browser mode", etc. Moreover, the URL to the laptop's user guide did not work anymore. Signed-off-by: Vincent Bernat Link: https://lore.kernel.org/r/20211109195209.176905-1-vincent@bernat.ch Signed-off-by: Hans de Goede --- Documentation/admin-guide/laptops/thinkpad-acpi.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst index 6721a80a2d4f..475eb0e81e4a 100644 --- a/Documentation/admin-guide/laptops/thinkpad-acpi.rst +++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst @@ -1520,15 +1520,15 @@ This sysfs attribute controls the keyboard "face" that will be shown on the Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read and set. -- 1 = Home mode -- 2 = Web-browser mode -- 3 = Web-conference mode -- 4 = Function mode -- 5 = Layflat mode +- 0 = Home mode +- 1 = Web-browser mode +- 2 = Web-conference mode +- 3 = Function mode +- 4 = Layflat mode For more details about which buttons will appear depending on the mode, please review the laptop's user guide: -http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf +https://download.lenovo.com/ibmdl/pub/pc/pccbbs/mobiles_pdf/x1carbon_2_ug_en.pdf Battery charge control ---------------------- From 5b54860943dc4681be5de2fc287408c7ce274dfc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Nov 2021 10:05:32 +0100 Subject: [PATCH 126/336] powerpc/book3e: Fix TLBCAM preset at boot Commit 52bda69ae8b5 ("powerpc/fsl_booke: Tell map_mem_in_cams() if init is done") was supposed to just add an additional parameter to map_mem_in_cams() and always set it to 'true' at that time. But a few call sites were messed up. Fix them. 
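Bare true/false arguments are easy to transpose, which is how the call sites got messed up in the first place. One way to keep them readable at a glance (parameter names assumed from the Fixes: commit, shown purely as an illustration):

  linear_map_top = map_mem_in_cams(linear_map_top, num_cams,
                                   false /* dryrun */, true /* init */);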
Fixes: 52bda69ae8b5 ("powerpc/fsl_booke: Tell map_mem_in_cams() if init is done") Reported-by: Christian Zigotzky Signed-off-by: Christophe Leroy Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d319f2a9367d4d08fd2154e506101bd5f100feeb.1636967119.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/nohash/kaslr_booke.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 8fc49b1b4a91..6ec978967da0 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -314,7 +314,7 @@ static unsigned long __init kaslr_choose_location(void *dt_ptr, phys_addr_t size pr_warn("KASLR: No safe seed for randomizing the kernel base.\n"); ram = min_t(phys_addr_t, __max_low_memory, size); - ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, false); + ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, true); linear_sz = min_t(unsigned long, ram, SZ_512M); /* If the linear size is smaller than 64M, do not randmize */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 89353d4f5604..647bf454a0fa 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -645,7 +645,7 @@ static void early_init_this_mmu(void) if (map) linear_map_top = map_mem_in_cams(linear_map_top, - num_cams, true, true); + num_cams, false, true); } #endif @@ -766,7 +766,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; linear_sz = map_mem_in_cams(first_memblock_size, num_cams, - false, true); + true, true); ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); } else From 5499802b2284331788a440585869590f1bd63f7f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Nov 2021 09:52:55 +0100 Subject: [PATCH 127/336] powerpc/signal32: Fix sigset_t copy The conversion from __copy_from_user() to __get_user() by commit d3ccc9781560 ("powerpc/signal: Use __get_user() to copy sigset_t") introduced a regression in __get_user_sigset() for powerpc/32. The bug was subsequently moved into unsafe_get_user_sigset(). The bug is due to the copied 64 bit value being truncated to 32 bits while being assigned to dst->sig[0] The regression was reported by users of the Xorg packages distributed in Debian/powerpc -- "The symptoms are that the fb screen goes blank, with the backlight remaining on and no errors logged in /var/log; wdm (or startx) run with no effect (I tried logging in in the blind, with no effect). And they are hard to kill, requiring 'kill -KILL ...'" Fix the regression by copying each word of the sigset, not only the first one. __get_user_sigset() was tentatively optimised to copy 64 bits at once in order to minimise KUAP unlock/lock impact, but the unsafe variant doesn't suffer that, so it can just copy words. 
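A worked illustration of the truncation in plain C (not the kernel macro; on ppc32, sigset_t is an array of 32-bit longs, so _NSIG_WORDS is 2):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t fetched = 0x000000ff00000001ULL;  /* bits set in both 32-bit halves */
          uint32_t sig[2] = { 0, 0 };                /* ppc32-style sigset: two 32-bit words */

          sig[0] = fetched;  /* implicit truncation: only 0x00000001 survives, sig[1] is never written */
          printf("sig[0]=%#x sig[1]=%#x\n", sig[0], sig[1]);  /* prints sig[0]=0x1 sig[1]=0 */
          return 0;
  }

Fetching each 32-bit word with its own unsafe_get_user(), as the hunk below does, avoids the truncation entirely and sidesteps any word-order questions on the big-endian 32-bit ABI.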
Fixes: 887f3ceb51cd ("powerpc/signal32: Convert do_setcontext[_tm]() to user access block") Cc: stable@vger.kernel.org # v5.13+ Reported-by: Finn Thain Reported-and-tested-by: Stan Johnson Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/99ef38d61c0eb3f79c68942deb0c35995a93a777.1636966353.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/signal.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 1f07317964e4..618aeccdf691 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -25,8 +25,14 @@ static inline int __get_user_sigset(sigset_t *dst, const sigset_t __user *src) return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]); } -#define unsafe_get_user_sigset(dst, src, label) \ - unsafe_get_user((dst)->sig[0], (u64 __user *)&(src)->sig[0], label) +#define unsafe_get_user_sigset(dst, src, label) do { \ + sigset_t *__dst = dst; \ + const sigset_t __user *__src = src; \ + int i; \ + \ + for (i = 0; i < _NSIG_WORDS; i++) \ + unsafe_get_user(__dst->sig[i], &__src->sig[i], label); \ +} while (0) #ifdef CONFIG_VSX extern unsigned long copy_vsx_to_user(void __user *to, From 1e35eba4055149c578baf0318d2f2f89ea3c44a0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 15 Nov 2021 09:08:36 +0100 Subject: [PATCH 128/336] powerpc/8xx: Fix pinned TLBs with CONFIG_STRICT_KERNEL_RWX As spotted and explained in commit c12ab8dbc492 ("powerpc/8xx: Fix Oops with STRICT_KERNEL_RWX without DEBUG_RODATA_TEST"), the selection of STRICT_KERNEL_RWX without selecting DEBUG_RODATA_TEST has spotted the lack of the DIRTY bit in the pinned kernel data TLBs. This problem should have been detected a lot earlier if things had been working as expected. But due to an incredible level of chance or mishap, this went undetected because of a set of bugs: In fact the DTLBs were not pinned, because instead of setting the reserve bit in MD_CTR, it was set in MI_CTR that is the register for ITLBs. But then, another huge bug was there: the physical address was reset to 0 at the boundary between RO and RW areas, leading to the same physical space being mapped at both 0xc0000000 and 0xc8000000. This had by miracle no consequence until now because the entry was not really pinned so it was overwritten soon enough to go undetected. Of course, now that we really pin the DTLBs, it must be fixed as well. 
Fixes: f76c8f6d257c ("powerpc/8xx: Add function to set pinned TLBs") Cc: stable@vger.kernel.org # v5.8+ Signed-off-by: Christophe Leroy Depends-on: c12ab8dbc492 ("powerpc/8xx: Fix Oops with STRICT_KERNEL_RWX without DEBUG_RODATA_TEST") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a21e9a057fe2d247a535aff0d157a54eefee017a.1636963688.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 2d596881b70e..0d073b9fd52c 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -733,6 +733,7 @@ _GLOBAL(mmu_pin_tlb) #ifdef CONFIG_PIN_TLB_DATA LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET) LOAD_REG_IMMEDIATE(r7, MI_SVALID | MI_PS8MEG | _PMD_ACCESSED) + li r8, 0 #ifdef CONFIG_PIN_TLB_IMMR li r0, 3 #else @@ -741,26 +742,26 @@ _GLOBAL(mmu_pin_tlb) mtctr r0 cmpwi r4, 0 beq 4f - LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) LOAD_REG_ADDR(r9, _sinittext) 2: ori r0, r6, MD_EVALID + ori r12, r8, 0xf0 | _PAGE_RO | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT mtspr SPRN_MD_CTR, r5 mtspr SPRN_MD_EPN, r0 mtspr SPRN_MD_TWC, r7 - mtspr SPRN_MD_RPN, r8 + mtspr SPRN_MD_RPN, r12 addi r5, r5, 0x100 addis r6, r6, SZ_8M@h addis r8, r8, SZ_8M@h cmplw r6, r9 bdnzt lt, 2b - -4: LOAD_REG_IMMEDIATE(r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT) +4: 2: ori r0, r6, MD_EVALID + ori r12, r8, 0xf0 | _PAGE_DIRTY | _PAGE_SPS | _PAGE_SH | _PAGE_PRESENT mtspr SPRN_MD_CTR, r5 mtspr SPRN_MD_EPN, r0 mtspr SPRN_MD_TWC, r7 - mtspr SPRN_MD_RPN, r8 + mtspr SPRN_MD_RPN, r12 addi r5, r5, 0x100 addis r6, r6, SZ_8M@h addis r8, r8, SZ_8M@h @@ -781,7 +782,7 @@ _GLOBAL(mmu_pin_tlb) #endif #if defined(CONFIG_PIN_TLB_IMMR) || defined(CONFIG_PIN_TLB_DATA) lis r0, (MD_RSV4I | MD_TWAM)@h - mtspr SPRN_MI_CTR, r0 + mtspr SPRN_MD_CTR, r0 #endif mtspr SPRN_SRR1, r10 mtspr SPRN_SRR0, r11 From 420f48f636b98fd685f44a3acc4c0a7c0840910d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 14 Oct 2021 13:33:45 +0200 Subject: [PATCH 129/336] s390/setup: avoid reserving memory above identity mapping Such reserved memory region, if not cleaned up later causes problems when memblock_free_all() is called to release free pages to the buddy allocator and those reserved regions are carried over to reserve_bootmem_region() which marks the pages as PageReserved. Instead use memblock_set_current_limit() to make sure memblock allocations do not go over identity mapping (which could happen when "mem=" option is used or during kdump). 
Cc: stable@vger.kernel.org Fixes: 73045a08cf55 ("s390: unify identity mapping limits handling") Reported-by: Gerald Schaefer Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 40405f2304f1..30aba0f21e85 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -637,14 +637,6 @@ static struct notifier_block kdump_mem_nb = { #endif -/* - * Make sure that the area above identity mapping is protected - */ -static void __init reserve_above_ident_map(void) -{ - memblock_reserve(ident_map_size, ULONG_MAX); -} - /* * Reserve memory for kdump kernel to be loaded with kexec */ @@ -999,11 +991,11 @@ void __init setup_arch(char **cmdline_p) setup_control_program_code(); /* Do some memory reservations *before* memory is added to memblock */ - reserve_above_ident_map(); reserve_kernel(); reserve_initrd(); reserve_certificate_list(); reserve_mem_detect_info(); + memblock_set_current_limit(ident_map_size); memblock_allow_resize(); /* Get information about *all* installed memory */ From 5dbc4cb4667457b0c53bcd7bff11500b3c362975 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 14 Oct 2021 13:38:17 +0200 Subject: [PATCH 130/336] s390/setup: avoid using memblock_enforce_memory_limit There is a difference in how architectures treat "mem=" option. For some that is an amount of online memory, for s390 and x86 this is the limiting max address. Some memblock api like memblock_enforce_memory_limit() take limit argument and explicitly treat it as the size of online memory, and use __find_max_addr to convert it to an actual max address. Current s390 usage: memblock_enforce_memory_limit(memblock_end_of_DRAM()); yields different results depending on presence of memory holes (offline memory blocks in between online memory). If there are no memory holes limit == max_addr in memblock_enforce_memory_limit() and it does trim online memory and reserved memory regions. With memory holes present it actually does nothing. Since we already use memblock_remove() explicitly to trim online memory regions to potential limit (think mem=, kdump, addressing limits, etc.) drop the usage of memblock_enforce_memory_limit() altogether. Trimming reserved regions should not be required, since we now use memblock_set_current_limit() to limit allocations and any explicit memory reservations above the limit is an actual problem we should not hide. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 30aba0f21e85..349d24df37b8 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -818,9 +818,6 @@ static void __init setup_memory(void) storage_key_init_range(start, end); psw_set_key(PAGE_DEFAULT_KEY); - - /* Only cosmetics */ - memblock_enforce_memory_limit(memblock_end_of_DRAM()); } static void __init relocate_amode31_section(void) From 6ad5f024d1f5612b9e39ced9f1add6e8121a7afb Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 14 Oct 2021 13:45:35 +0200 Subject: [PATCH 131/336] s390/setup: re-arrange memblock setup - Avoid using ULONG_MAX in memblock_remove, it has no functional change but makes memblock_dbg output a range which makes sense. - Actually finish memblock memory setup before doing amode31/cr/uv setup. 
- Move memblock_dump_all() debug output after memblock memory setup is complete. This gives us final "memory" regions if they were trimmed due to addressing limits and still "physmem" regions as original info which came from mem_detect. Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 349d24df37b8..225ab2d0a4c6 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -606,7 +606,7 @@ static void __init setup_resources(void) static void __init setup_memory_end(void) { - memblock_remove(ident_map_size, ULONG_MAX); + memblock_remove(ident_map_size, PHYS_ADDR_MAX - ident_map_size); max_pfn = max_low_pfn = PFN_DOWN(ident_map_size); pr_notice("The maximum memory size is %luMB\n", ident_map_size >> 20); } @@ -777,7 +777,6 @@ static void __init memblock_add_mem_detect_info(void) } memblock_set_bottom_up(false); memblock_set_node(0, ULONG_MAX, &memblock.memory, 0); - memblock_dump_all(); } /* @@ -999,13 +998,13 @@ void __init setup_arch(char **cmdline_p) memblock_add_mem_detect_info(); free_mem_detect_info(); + setup_memory_end(); + memblock_dump_all(); + setup_memory(); relocate_amode31_section(); setup_cr(); - setup_uv(); - setup_memory_end(); - setup_memory(); dma_contiguous_reserve(ident_map_size); vmcp_cma_reserve(); if (MACHINE_HAS_EDAT2) From 9a39abb7c9aab50eec4ac4421e9ee7f3de013d24 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Thu, 14 Oct 2021 13:53:54 +0200 Subject: [PATCH 132/336] s390/boot: simplify and fix kernel memory layout setup Initial KASAN shadow memory range was picked to preserve original kernel modules area position. With protected execution support, which might impose addressing limitation on vmalloc area and hence affect modules area position, current fixed KASAN shadow memory range is only making kernel memory layout setup more complex. So move it to the very end of available virtual space and simplify calculations. At the same time return to previous kernel address space split. In particular commit 0c4f2623b957 ("s390: setup kernel memory layout early") introduced precise identity map size calculation and keeping vmemmap left most starting from a fresh region table entry. This didn't take into account additional mapping region requirement for potential DCSS mapping above available physical memory. So go back to virtual space split between 1:1 mapping & vmemmap array once vmalloc area size is subtracted. 
Cc: stable@vger.kernel.org Fixes: 0c4f2623b957 ("s390: setup kernel memory layout early") Reported-by: Gerald Schaefer Reviewed-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 2 +- arch/s390/boot/startup.c | 88 ++++++++++++++-------------------------- 2 files changed, 32 insertions(+), 58 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8857ec3b97eb..35f99b8f236e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -47,7 +47,7 @@ config ARCH_SUPPORTS_UPROBES config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0x18000000000000 + default 0x1C000000000000 config S390 def_bool y diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 7571dee72a0c..1aa11a8f57dd 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -149,82 +149,56 @@ static void setup_ident_map_size(unsigned long max_physmem_end) static void setup_kernel_memory_layout(void) { - bool vmalloc_size_verified = false; - unsigned long vmemmap_off; - unsigned long vspace_left; + unsigned long vmemmap_start; unsigned long rte_size; unsigned long pages; - unsigned long vmax; pages = ident_map_size / PAGE_SIZE; /* vmemmap contains a multiple of PAGES_PER_SECTION struct pages */ vmemmap_size = SECTION_ALIGN_UP(pages) * sizeof(struct page); /* choose kernel address space layout: 4 or 3 levels. */ - vmemmap_off = round_up(ident_map_size, _REGION3_SIZE); + vmemmap_start = round_up(ident_map_size, _REGION3_SIZE); if (IS_ENABLED(CONFIG_KASAN) || vmalloc_size > _REGION2_SIZE || - vmemmap_off + vmemmap_size + vmalloc_size + MODULES_LEN > _REGION2_SIZE) - vmax = _REGION1_SIZE; - else - vmax = _REGION2_SIZE; - - /* keep vmemmap_off aligned to a top level region table entry */ - rte_size = vmax == _REGION1_SIZE ? _REGION2_SIZE : _REGION3_SIZE; - MODULES_END = vmax; - if (is_prot_virt_host()) { - /* - * forcing modules and vmalloc area under the ultravisor - * secure storage limit, so that any vmalloc allocation - * we do could be used to back secure guest storage. - */ - adjust_to_uv_max(&MODULES_END); - } - -#ifdef CONFIG_KASAN - if (MODULES_END < vmax) { - /* force vmalloc and modules below kasan shadow */ - MODULES_END = min(MODULES_END, KASAN_SHADOW_START); + vmemmap_start + vmemmap_size + vmalloc_size + MODULES_LEN > + _REGION2_SIZE) { + MODULES_END = _REGION1_SIZE; + rte_size = _REGION2_SIZE; } else { - /* - * leave vmalloc and modules above kasan shadow but make - * sure they don't overlap with it - */ - vmalloc_size = min(vmalloc_size, vmax - KASAN_SHADOW_END - MODULES_LEN); - vmalloc_size_verified = true; - vspace_left = KASAN_SHADOW_START; + MODULES_END = _REGION2_SIZE; + rte_size = _REGION3_SIZE; } + /* + * forcing modules and vmalloc area under the ultravisor + * secure storage limit, so that any vmalloc allocation + * we do could be used to back secure guest storage. 
+ */ + adjust_to_uv_max(&MODULES_END); +#ifdef CONFIG_KASAN + /* force vmalloc and modules below kasan shadow */ + MODULES_END = min(MODULES_END, KASAN_SHADOW_START); #endif MODULES_VADDR = MODULES_END - MODULES_LEN; VMALLOC_END = MODULES_VADDR; - if (vmalloc_size_verified) { - VMALLOC_START = VMALLOC_END - vmalloc_size; - } else { - vmemmap_off = round_up(ident_map_size, rte_size); + /* allow vmalloc area to occupy up to about 1/2 of the rest virtual space left */ + vmalloc_size = min(vmalloc_size, round_down(VMALLOC_END / 2, _REGION3_SIZE)); + VMALLOC_START = VMALLOC_END - vmalloc_size; - if (vmemmap_off + vmemmap_size > VMALLOC_END || - vmalloc_size > VMALLOC_END - vmemmap_off - vmemmap_size) { - /* - * allow vmalloc area to occupy up to 1/2 of - * the rest virtual space left. - */ - vmalloc_size = min(vmalloc_size, VMALLOC_END / 2); - } - VMALLOC_START = VMALLOC_END - vmalloc_size; - vspace_left = VMALLOC_START; - } - - pages = vspace_left / (PAGE_SIZE + sizeof(struct page)); + /* split remaining virtual space between 1:1 mapping & vmemmap array */ + pages = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); pages = SECTION_ALIGN_UP(pages); - vmemmap_off = round_up(vspace_left - pages * sizeof(struct page), rte_size); - /* keep vmemmap left most starting from a fresh region table entry */ - vmemmap_off = min(vmemmap_off, round_up(ident_map_size, rte_size)); - /* take care that identity map is lower then vmemmap */ - ident_map_size = min(ident_map_size, vmemmap_off); + /* keep vmemmap_start aligned to a top level region table entry */ + vmemmap_start = round_down(VMALLOC_START - pages * sizeof(struct page), rte_size); + /* vmemmap_start is the future VMEM_MAX_PHYS, make sure it is within MAX_PHYSMEM */ + vmemmap_start = min(vmemmap_start, 1UL << MAX_PHYSMEM_BITS); + /* make sure identity map doesn't overlay with vmemmap */ + ident_map_size = min(ident_map_size, vmemmap_start); vmemmap_size = SECTION_ALIGN_UP(ident_map_size / PAGE_SIZE) * sizeof(struct page); - VMALLOC_START = max(vmemmap_off + vmemmap_size, VMALLOC_START); - vmemmap = (struct page *)vmemmap_off; + /* make sure vmemmap doesn't overlay with vmalloc area */ + VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START); + vmemmap = (struct page *)vmemmap_start; } /* From 4b9e04367afe214e06736685a7962fcbadd8b0af Mon Sep 17 00:00:00 2001 From: Qing Wang Date: Thu, 14 Oct 2021 23:50:54 -0700 Subject: [PATCH 133/336] s390: replace snprintf in show functions with sysfs_emit show() must not use snprintf() when formatting the value to be returned to user space. Fix the coccicheck warnings: WARNING: use scnprintf or sprintf. Use sysfs_emit instead of scnprintf or sprintf makes more sense. 
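For reference, what a converted show() callback boils down to (generic sketch, not one of the s390 attributes touched below):

  #include <linux/device.h>
  #include <linux/sysfs.h>

  static ssize_t example_show(struct device *dev,
                              struct device_attribute *attr, char *buf)
  {
          int val = 42;  /* stand-in for the real driver state */

          /* sysfs_emit() knows buf is a PAGE_SIZE sysfs buffer and checks it,
           * so the caller no longer has to pass PAGE_SIZE around by hand. */
          return sysfs_emit(buf, "%d\n", val);
  }
  static DEVICE_ATTR_RO(example);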
Signed-off-by: Qing Wang Acked-by: Vineeth Vijayan Acked-by: Stefan Haberland Link: https://lore.kernel.org/r/1634280655-4908-1-git-send-email-wangqing@vivo.com [hca@linux.ibm.com: fix indentation] Signed-off-by: Heiko Carstens --- drivers/s390/block/dasd_devmap.c | 76 ++++++++++++++++---------------- drivers/s390/char/raw3270.c | 12 ++--- drivers/s390/cio/chp.c | 2 +- 3 files changed, 45 insertions(+), 45 deletions(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 2c40fe15da55..6043c832d09e 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -731,7 +731,7 @@ static ssize_t dasd_ff_show(struct device *dev, struct device_attribute *attr, ff_flag = (devmap->features & DASD_FEATURE_FAILFAST) != 0; else ff_flag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_FAILFAST) != 0; - return snprintf(buf, PAGE_SIZE, ff_flag ? "1\n" : "0\n"); + return sysfs_emit(buf, ff_flag ? "1\n" : "0\n"); } static ssize_t dasd_ff_store(struct device *dev, struct device_attribute *attr, @@ -773,7 +773,7 @@ dasd_ro_show(struct device *dev, struct device_attribute *attr, char *buf) spin_unlock(&dasd_devmap_lock); out: - return snprintf(buf, PAGE_SIZE, ro_flag ? "1\n" : "0\n"); + return sysfs_emit(buf, ro_flag ? "1\n" : "0\n"); } static ssize_t @@ -834,7 +834,7 @@ dasd_erplog_show(struct device *dev, struct device_attribute *attr, char *buf) erplog = (devmap->features & DASD_FEATURE_ERPLOG) != 0; else erplog = (DASD_FEATURE_DEFAULT & DASD_FEATURE_ERPLOG) != 0; - return snprintf(buf, PAGE_SIZE, erplog ? "1\n" : "0\n"); + return sysfs_emit(buf, erplog ? "1\n" : "0\n"); } static ssize_t @@ -1033,13 +1033,13 @@ dasd_discipline_show(struct device *dev, struct device_attribute *attr, dasd_put_device(device); goto out; } else { - len = snprintf(buf, PAGE_SIZE, "%s\n", - device->discipline->name); + len = sysfs_emit(buf, "%s\n", + device->discipline->name); dasd_put_device(device); return len; } out: - len = snprintf(buf, PAGE_SIZE, "none\n"); + len = sysfs_emit(buf, "none\n"); return len; } @@ -1056,30 +1056,30 @@ dasd_device_status_show(struct device *dev, struct device_attribute *attr, if (!IS_ERR(device)) { switch (device->state) { case DASD_STATE_NEW: - len = snprintf(buf, PAGE_SIZE, "new\n"); + len = sysfs_emit(buf, "new\n"); break; case DASD_STATE_KNOWN: - len = snprintf(buf, PAGE_SIZE, "detected\n"); + len = sysfs_emit(buf, "detected\n"); break; case DASD_STATE_BASIC: - len = snprintf(buf, PAGE_SIZE, "basic\n"); + len = sysfs_emit(buf, "basic\n"); break; case DASD_STATE_UNFMT: - len = snprintf(buf, PAGE_SIZE, "unformatted\n"); + len = sysfs_emit(buf, "unformatted\n"); break; case DASD_STATE_READY: - len = snprintf(buf, PAGE_SIZE, "ready\n"); + len = sysfs_emit(buf, "ready\n"); break; case DASD_STATE_ONLINE: - len = snprintf(buf, PAGE_SIZE, "online\n"); + len = sysfs_emit(buf, "online\n"); break; default: - len = snprintf(buf, PAGE_SIZE, "no stat\n"); + len = sysfs_emit(buf, "no stat\n"); break; } dasd_put_device(device); } else - len = snprintf(buf, PAGE_SIZE, "unknown\n"); + len = sysfs_emit(buf, "unknown\n"); return len; } @@ -1120,7 +1120,7 @@ static ssize_t dasd_vendor_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); vendor = ""; if (IS_ERR(device)) - return snprintf(buf, PAGE_SIZE, "%s\n", vendor); + return sysfs_emit(buf, "%s\n", vendor); if (device->discipline && device->discipline->get_uid && !device->discipline->get_uid(device, &uid)) @@ -1128,7 +1128,7 @@ static ssize_t dasd_vendor_show(struct device *dev, 
dasd_put_device(device); - return snprintf(buf, PAGE_SIZE, "%s\n", vendor); + return sysfs_emit(buf, "%s\n", vendor); } static DEVICE_ATTR(vendor, 0444, dasd_vendor_show, NULL); @@ -1148,7 +1148,7 @@ dasd_uid_show(struct device *dev, struct device_attribute *attr, char *buf) device = dasd_device_from_cdev(to_ccwdev(dev)); uid_string[0] = 0; if (IS_ERR(device)) - return snprintf(buf, PAGE_SIZE, "%s\n", uid_string); + return sysfs_emit(buf, "%s\n", uid_string); if (device->discipline && device->discipline->get_uid && !device->discipline->get_uid(device, &uid)) { @@ -1183,7 +1183,7 @@ dasd_uid_show(struct device *dev, struct device_attribute *attr, char *buf) } dasd_put_device(device); - return snprintf(buf, PAGE_SIZE, "%s\n", uid_string); + return sysfs_emit(buf, "%s\n", uid_string); } static DEVICE_ATTR(uid, 0444, dasd_uid_show, NULL); @@ -1201,7 +1201,7 @@ dasd_eer_show(struct device *dev, struct device_attribute *attr, char *buf) eer_flag = dasd_eer_enabled(devmap->device); else eer_flag = 0; - return snprintf(buf, PAGE_SIZE, eer_flag ? "1\n" : "0\n"); + return sysfs_emit(buf, eer_flag ? "1\n" : "0\n"); } static ssize_t @@ -1243,7 +1243,7 @@ dasd_expires_show(struct device *dev, struct device_attribute *attr, char *buf) device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) return -ENODEV; - len = snprintf(buf, PAGE_SIZE, "%lu\n", device->default_expires); + len = sysfs_emit(buf, "%lu\n", device->default_expires); dasd_put_device(device); return len; } @@ -1283,7 +1283,7 @@ dasd_retries_show(struct device *dev, struct device_attribute *attr, char *buf) device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) return -ENODEV; - len = snprintf(buf, PAGE_SIZE, "%lu\n", device->default_retries); + len = sysfs_emit(buf, "%lu\n", device->default_retries); dasd_put_device(device); return len; } @@ -1324,7 +1324,7 @@ dasd_timeout_show(struct device *dev, struct device_attribute *attr, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) return -ENODEV; - len = snprintf(buf, PAGE_SIZE, "%lu\n", device->blk_timeout); + len = sysfs_emit(buf, "%lu\n", device->blk_timeout); dasd_put_device(device); return len; } @@ -1398,11 +1398,11 @@ static ssize_t dasd_hpf_show(struct device *dev, struct device_attribute *attr, return -ENODEV; if (!device->discipline || !device->discipline->hpf_enabled) { dasd_put_device(device); - return snprintf(buf, PAGE_SIZE, "%d\n", dasd_nofcx); + return sysfs_emit(buf, "%d\n", dasd_nofcx); } hpf = device->discipline->hpf_enabled(device); dasd_put_device(device); - return snprintf(buf, PAGE_SIZE, "%d\n", hpf); + return sysfs_emit(buf, "%d\n", hpf); } static DEVICE_ATTR(hpf, 0444, dasd_hpf_show, NULL); @@ -1416,13 +1416,13 @@ static ssize_t dasd_reservation_policy_show(struct device *dev, devmap = dasd_find_busid(dev_name(dev)); if (IS_ERR(devmap)) { - rc = snprintf(buf, PAGE_SIZE, "ignore\n"); + rc = sysfs_emit(buf, "ignore\n"); } else { spin_lock(&dasd_devmap_lock); if (devmap->features & DASD_FEATURE_FAILONSLCK) - rc = snprintf(buf, PAGE_SIZE, "fail\n"); + rc = sysfs_emit(buf, "fail\n"); else - rc = snprintf(buf, PAGE_SIZE, "ignore\n"); + rc = sysfs_emit(buf, "ignore\n"); spin_unlock(&dasd_devmap_lock); } return rc; @@ -1457,14 +1457,14 @@ static ssize_t dasd_reservation_state_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) - return snprintf(buf, PAGE_SIZE, "none\n"); + return sysfs_emit(buf, "none\n"); if (test_bit(DASD_FLAG_IS_RESERVED, &device->flags)) - rc = snprintf(buf, PAGE_SIZE, 
"reserved\n"); + rc = sysfs_emit(buf, "reserved\n"); else if (test_bit(DASD_FLAG_LOCK_STOLEN, &device->flags)) - rc = snprintf(buf, PAGE_SIZE, "lost\n"); + rc = sysfs_emit(buf, "lost\n"); else - rc = snprintf(buf, PAGE_SIZE, "none\n"); + rc = sysfs_emit(buf, "none\n"); dasd_put_device(device); return rc; } @@ -1531,7 +1531,7 @@ dasd_path_threshold_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) return -ENODEV; - len = snprintf(buf, PAGE_SIZE, "%lu\n", device->path_thrhld); + len = sysfs_emit(buf, "%lu\n", device->path_thrhld); dasd_put_device(device); return len; } @@ -1578,7 +1578,7 @@ dasd_path_autodisable_show(struct device *dev, else flag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_PATH_AUTODISABLE) != 0; - return snprintf(buf, PAGE_SIZE, flag ? "1\n" : "0\n"); + return sysfs_emit(buf, flag ? "1\n" : "0\n"); } static ssize_t @@ -1616,7 +1616,7 @@ dasd_path_interval_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) return -ENODEV; - len = snprintf(buf, PAGE_SIZE, "%lu\n", device->path_interval); + len = sysfs_emit(buf, "%lu\n", device->path_interval); dasd_put_device(device); return len; } @@ -1662,9 +1662,9 @@ dasd_device_fcs_show(struct device *dev, struct device_attribute *attr, return -ENODEV; fc_sec = dasd_path_get_fcs_device(device); if (fc_sec == -EINVAL) - rc = snprintf(buf, PAGE_SIZE, "Inconsistent\n"); + rc = sysfs_emit(buf, "Inconsistent\n"); else - rc = snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); + rc = sysfs_emit(buf, "%s\n", dasd_path_get_fcs_str(fc_sec)); dasd_put_device(device); return rc; @@ -1677,7 +1677,7 @@ dasd_path_fcs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) struct dasd_path *path = to_dasd_path(kobj); unsigned int fc_sec = path->fc_security; - return snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); + return sysfs_emit(buf, "%s\n", dasd_path_get_fcs_str(fc_sec)); } static struct kobj_attribute path_fcs_attribute = @@ -1698,7 +1698,7 @@ static ssize_t dasd_##_name##_show(struct device *dev, \ val = _func(device); \ dasd_put_device(device); \ \ - return snprintf(buf, PAGE_SIZE, "%d\n", val); \ + return sysfs_emit(buf, "%d\n", val); \ } \ static DEVICE_ATTR(_name, 0444, dasd_##_name##_show, NULL); \ diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c index 646ec796bb83..dfde0d941c3c 100644 --- a/drivers/s390/char/raw3270.c +++ b/drivers/s390/char/raw3270.c @@ -1047,24 +1047,24 @@ raw3270_probe (struct ccw_device *cdev) static ssize_t raw3270_model_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%i\n", - ((struct raw3270 *) dev_get_drvdata(dev))->model); + return sysfs_emit(buf, "%i\n", + ((struct raw3270 *)dev_get_drvdata(dev))->model); } static DEVICE_ATTR(model, 0444, raw3270_model_show, NULL); static ssize_t raw3270_rows_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%i\n", - ((struct raw3270 *) dev_get_drvdata(dev))->rows); + return sysfs_emit(buf, "%i\n", + ((struct raw3270 *)dev_get_drvdata(dev))->rows); } static DEVICE_ATTR(rows, 0444, raw3270_rows_show, NULL); static ssize_t raw3270_columns_show(struct device *dev, struct device_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%i\n", - ((struct raw3270 *) dev_get_drvdata(dev))->cols); + return sysfs_emit(buf, "%i\n", + ((struct raw3270 *)dev_get_drvdata(dev))->cols); } static DEVICE_ATTR(columns, 0444, 
raw3270_columns_show, NULL); diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 1097e76982a5..5440f285f349 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -285,7 +285,7 @@ static ssize_t chp_configure_show(struct device *dev, if (status < 0) return status; - return snprintf(buf, PAGE_SIZE, "%d\n", status); + return sysfs_emit(buf, "%d\n", status); } static int cfg_wait_idle(void); From 7b737adc10d269e7fdf714ae2caa2281b6a801cf Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 8 Nov 2021 01:21:11 +0900 Subject: [PATCH 134/336] s390/vdso: remove -nostdlib compiler flag The -nostdlib option requests the compiler to not use the standard system startup files or libraries when linking. It is effective only when $(CC) is used as a linker driver. Since commit 2b2a25845d53 ("s390/vdso: Use $(LD) instead of $(CC) to link vDSO"), $(LD) is directly used, hence -nostdlib is unneeded. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20211107162111.323701-1-masahiroy@kernel.org Signed-off-by: Heiko Carstens --- arch/s390/kernel/vdso32/Makefile | 2 +- arch/s390/kernel/vdso64/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index e3e6ac5686df..245bddfe9bc0 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -22,7 +22,7 @@ KBUILD_AFLAGS_32 += -m31 -s KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin -LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \ +LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \ --hash-style=both --build-id=sha1 -melf_s390 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 6568de236701..e7d911780935 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -25,7 +25,7 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin -ldflags-y := -fPIC -shared -nostdlib -soname=linux-vdso64.so.1 \ +ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \ --hash-style=both --build-id=sha1 -T $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) From 00b55eaf45549ce26424224d069a091c7e5d8bac Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 11 Nov 2021 10:58:26 +0100 Subject: [PATCH 135/336] s390/vdso: filter out -mstack-guard and -mstack-size When CONFIG_VMAP_STACK is disabled, the user can enable CONFIG_STACK_CHECK, which adds a stack overflow check to each C function in the kernel. This is also done for functions in the vdso page. These functions are run in user context and user stack sizes are usually different to what the kernel uses. This might trigger the stack check although the stack size is valid. Therefore filter the -mstack-guard and -mstack-size flags when compiling vdso C files. 
Cc: stable@kernel.org # 5.10+ Fixes: 4bff8cb54502 ("s390: convert to GENERIC_VDSO") Reported-by: Janosch Frank Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/Makefile | 10 ++++++---- arch/s390/kernel/vdso64/Makefile | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 69c45f600273..609e3697324b 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -77,10 +77,12 @@ KBUILD_AFLAGS_DECOMPRESSOR += $(aflags-y) KBUILD_CFLAGS_DECOMPRESSOR += $(cflags-y) ifneq ($(call cc-option,-mstack-size=8192 -mstack-guard=128),) -cflags-$(CONFIG_CHECK_STACK) += -mstack-size=$(STACK_SIZE) -ifeq ($(call cc-option,-mstack-size=8192),) -cflags-$(CONFIG_CHECK_STACK) += -mstack-guard=$(CONFIG_STACK_GUARD) -endif + CC_FLAGS_CHECK_STACK := -mstack-size=$(STACK_SIZE) + ifeq ($(call cc-option,-mstack-size=8192),) + CC_FLAGS_CHECK_STACK += -mstack-guard=$(CONFIG_STACK_GUARD) + endif + export CC_FLAGS_CHECK_STACK + cflags-$(CONFIG_CHECK_STACK) += $(CC_FLAGS_CHECK_STACK) endif ifdef CONFIG_EXPOLINE diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index e7d911780935..9e2b95a222a9 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -8,8 +8,9 @@ ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT include $(srctree)/lib/vdso/Makefile obj-vdso64 = vdso_user_wrapper.o note.o obj-cvdso64 = vdso64_generic.o getcpu.o -CFLAGS_REMOVE_getcpu.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) -CFLAGS_REMOVE_vdso64_generic.o = -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) +VDSO_CFLAGS_REMOVE := -pg $(CC_FLAGS_FTRACE) $(CC_FLAGS_EXPOLINE) $(CC_FLAGS_CHECK_STACK) +CFLAGS_REMOVE_getcpu.o = $(VDSO_CFLAGS_REMOVE) +CFLAGS_REMOVE_vdso64_generic.o = $(VDSO_CFLAGS_REMOVE) # Build rules From 6c122360cf2f4c5a856fcbd79b4485b7baec942a Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Nov 2021 14:38:18 +0100 Subject: [PATCH 136/336] s390: wire up sys_futex_waitv system call Tested with futex kselftests. Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/syscalls/syscall.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index df5261e5cfe1..ed9c5c2eafad 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -451,3 +451,4 @@ 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv sys_futex_waitv From 099f896f498a2b26d84f4ddae039b2c542c18b48 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 14 Nov 2021 20:40:06 -0800 Subject: [PATCH 137/336] udp: Validate checksum in udp_read_sock() It turns out the skb's in sock receive queue could have bad checksums, as both ->poll() and ->recvmsg() validate checksums. We have to do the same for ->read_sock() path too before they are redirected in sockmap. 
Fixes: d7f571188ecf ("udp: Implement ->read_sock() for sockmap") Reported-by: Daniel Borkmann Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211115044006.26068-1-xiyou.wangcong@gmail.com --- net/ipv4/udp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 319dd7bbfe33..8bcecdd6aeda 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1807,6 +1807,17 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, skb = skb_recv_udp(sk, 0, 1, &err); if (!skb) return err; + + if (udp_lib_checksum_complete(skb)) { + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + kfree_skb(skb); + continue; + } + used = recv_actor(desc, skb, 0, skb->len); if (used <= 0) { if (!copied) From 7c4de881f7eba40088241f1c3ff5ca28c4e7fa48 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:22 +0000 Subject: [PATCH 138/336] KVM: selftests: Add event channel upcall support to xen_shinfo_test When I first looked at this, there was no support for guest exception handling in the KVM selftests. In fact it was merged into 5.10 before the Xen support got merged in 5.11, and I could have used it from the start. Hook it up now, to exercise the Xen upcall delivery. I'm about to make things a bit more interesting by handling the full 2level event channel stuff in-kernel on top of the basic vector injection that we already have, and I'll want to build more tests on top. Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-3-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/xen_shinfo_test.c | 75 ++++++++++++++++--- 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index eda0d2a51224..a0699f00b3d6 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -24,8 +24,12 @@ #define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) #define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + 0x20) +#define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) #define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + 0x20) +#define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) + +#define EVTCHN_VECTOR 0x10 static struct kvm_vm *vm; @@ -56,15 +60,44 @@ struct vcpu_runstate_info { uint64_t time[4]; }; +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; + +struct vcpu_info { + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; +}; /* 64 bytes (x86) */ + #define RUNSTATE_running 0 #define RUNSTATE_runnable 1 #define RUNSTATE_blocked 2 #define RUNSTATE_offline 3 +static void evtchn_handler(struct ex_regs *regs) +{ + struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; + vi->evtchn_upcall_pending = 0; + + GUEST_SYNC(0x20); +} + static void guest_code(void) { struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; + __asm__ __volatile__( + "sti\n" + "nop\n" + ); + + /* Trigger an interrupt injection */ + GUEST_SYNC(0); + /* Test having the host set runstates manually */ GUEST_SYNC(RUNSTATE_runnable); GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); @@ -153,7 +186,7 @@ int main(int argc, char *argv[]) struct kvm_xen_vcpu_attr vi = { 
.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, - .u.gpa = SHINFO_REGION_GPA + 0x40, + .u.gpa = VCPU_INFO_ADDR, }; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &vi); @@ -163,6 +196,16 @@ int main(int argc, char *argv[]) }; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &pvclock); + struct kvm_xen_hvm_attr vec = { + .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR, + .u.vector = EVTCHN_VECTOR, + }; + vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler); + if (do_runstate_tests) { struct kvm_xen_vcpu_attr st = { .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, @@ -171,9 +214,14 @@ int main(int argc, char *argv[]) vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); } + struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); + vinfo->evtchn_upcall_pending = 0; + struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); rs->state = 0x5a; + bool evtchn_irq_expected = false; + for (;;) { volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); struct ucall uc; @@ -193,16 +241,21 @@ int main(int argc, char *argv[]) struct kvm_xen_vcpu_attr rst; long rundelay; - /* If no runstate support, bail out early */ - if (!do_runstate_tests) - goto done; - - TEST_ASSERT(rs->state_entry_time == rs->time[0] + - rs->time[1] + rs->time[2] + rs->time[3], - "runstate times don't add up"); + if (do_runstate_tests) + TEST_ASSERT(rs->state_entry_time == rs->time[0] + + rs->time[1] + rs->time[2] + rs->time[3], + "runstate times don't add up"); switch (uc.args[1]) { - case RUNSTATE_running...RUNSTATE_offline: + case 0: + evtchn_irq_expected = true; + vinfo->evtchn_upcall_pending = 1; + break; + + case RUNSTATE_runnable...RUNSTATE_offline: + TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen"); + if (!do_runstate_tests) + goto done; rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT; rst.u.runstate.state = uc.args[1]; vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &rst); @@ -236,6 +289,10 @@ int main(int argc, char *argv[]) sched_yield(); } while (get_run_delay() < rundelay); break; + case 0x20: + TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); + evtchn_irq_expected = false; + break; } break; } From 531ca3d6d518f520f67a71e270ac733901896d8f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:02:59 +0000 Subject: [PATCH 139/336] KVM: selftests: Explicitly state indicies for vm_guest_mode_params array Explicitly state the indices when populating vm_guest_mode_params to make it marginally easier to visualize what's going on. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon [Added indices for new guest modes.] 
Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 041004c0fda7..b624c24290dd 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -187,15 +187,15 @@ const char *vm_guest_mode_string(uint32_t i) } const struct vm_guest_mode_params vm_guest_mode_params[] = { - { 52, 48, 0x1000, 12 }, - { 52, 48, 0x10000, 16 }, - { 48, 48, 0x1000, 12 }, - { 48, 48, 0x10000, 16 }, - { 40, 48, 0x1000, 12 }, - { 40, 48, 0x10000, 16 }, - { 0, 0, 0x1000, 12 }, - { 47, 64, 0x1000, 12 }, - { 44, 64, 0x1000, 12 }, + [VM_MODE_P52V48_4K] = { 52, 48, 0x1000, 12 }, + [VM_MODE_P52V48_64K] = { 52, 48, 0x10000, 16 }, + [VM_MODE_P48V48_4K] = { 48, 48, 0x1000, 12 }, + [VM_MODE_P48V48_64K] = { 48, 48, 0x10000, 16 }, + [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, + [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, + [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, + [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, }; _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES, "Missing new mode params?"); From c071ff41e1502990a0902c7c3fcf72a462e46330 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:00 +0000 Subject: [PATCH 140/336] KVM: selftests: Expose align() helpers to tests Refactor align() to work with non-pointers and split into separate helpers for aligning up vs. down. Add align_ptr_up() for use with pointers. Expose all helpers so that they can be used by tests and/or other utilities. The align_down() helper in particular will be used to ensure gpa alignment for hugepages. No functional change intended. [Added sepearate up/down helpers and replaced open-coded alignment bit math throughout the KVM selftests.] 
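Worked examples for the new helpers (assumes the definitions added to include/test_util.h by this patch are in scope):

  static void align_helper_examples(void)
  {
          TEST_ASSERT(align_up(0x1234, 0x1000) == 0x2000, "rounds up to the next 4K boundary");
          TEST_ASSERT(align_up(0x2000, 0x1000) == 0x2000, "already aligned values are unchanged");
          TEST_ASSERT(align_down(0x1234, 0x1000) == 0x1000, "rounds down to the 4K boundary");
          TEST_ASSERT(align_down(0x2000, 0x1000) == 0x2000, "already aligned values are unchanged");
          /* align_ptr_up() is the same rounding applied to pointers: */
          TEST_ASSERT(align_ptr_up((void *)0x1234, 0x1000) == (void *)0x2000, "pointer variant");
  }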
Signed-off-by: Sean Christopherson Signed-off-by: David Matlack Reviewed-by: Ben Gardon Message-Id: <20211111000310.1435032-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/dirty_log_test.c | 6 ++--- .../testing/selftests/kvm/include/test_util.h | 25 +++++++++++++++++++ .../selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/elf.c | 3 +-- tools/testing/selftests/kvm/lib/kvm_util.c | 13 ++-------- .../selftests/kvm/lib/perf_test_util.c | 4 +-- 6 files changed, 34 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 792c60e1b17d..3fcd89e195c7 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -115,7 +115,7 @@ static void guest_code(void) addr = guest_test_virt_mem; addr += (READ_ONCE(random_array[i]) % guest_num_pages) * guest_page_size; - addr &= ~(host_page_size - 1); + addr = align_down(addr, host_page_size); *(uint64_t *)addr = READ_ONCE(iteration); } @@ -737,14 +737,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (!p->phys_offset) { guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * guest_page_size; - guest_test_phys_mem &= ~(host_page_size - 1); + guest_test_phys_mem = align_down(guest_test_phys_mem, host_page_size); } else { guest_test_phys_mem = p->phys_offset; } #ifdef __s390x__ /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); + guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index f8fddc84c0d3..78c06310cc0e 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -117,4 +117,29 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; } +/* Aligns x up to the next multiple of size. Size must be a power of 2. 
*/ +static inline uint64_t align_up(uint64_t x, uint64_t size) +{ + uint64_t mask = size - 1; + + TEST_ASSERT(size != 0 && !(size & (size - 1)), + "size not a power of 2: %lu", size); + return ((x + mask) & ~mask); +} + +static inline uint64_t align_down(uint64_t x, uint64_t size) +{ + uint64_t x_aligned_up = align_up(x, size); + + if (x == x_aligned_up) + return x; + else + return x_aligned_up - size; +} + +static inline void *align_ptr_up(void *x, size_t size) +{ + return (void *)align_up((unsigned long)x, size); +} + #endif /* SELFTEST_KVM_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 36407cb0ec85..3836322add00 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -280,7 +280,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) #ifdef __s390x__ alignment = max(0x100000, alignment); #endif - guest_test_phys_mem &= ~(alignment - 1); + guest_test_phys_mem = align_down(guest_test_virt_mem, alignment); /* Set up the shared data structure test_args */ test_args.vm = vm; diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c index eac44f5d0db0..13e8e3dcf984 100644 --- a/tools/testing/selftests/kvm/lib/elf.c +++ b/tools/testing/selftests/kvm/lib/elf.c @@ -157,8 +157,7 @@ void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename) "memsize of 0,\n" " phdr index: %u p_memsz: 0x%" PRIx64, n1, (uint64_t) phdr.p_memsz); - vm_vaddr_t seg_vstart = phdr.p_vaddr; - seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1); + vm_vaddr_t seg_vstart = align_down(phdr.p_vaddr, vm->page_size); vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1; seg_vend |= vm->page_size - 1; size_t seg_size = seg_vend - seg_vstart + 1; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b624c24290dd..63375118d48f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -22,15 +22,6 @@ static int vcpu_mmap_sz(void); -/* Aligns x up to the next multiple of size. Size must be a power of 2. */ -static void *align(void *x, size_t size) -{ - size_t mask = size - 1; - TEST_ASSERT(size != 0 && !(size & (size - 1)), - "size not a power of 2: %lu", size); - return (void *) (((size_t) x + mask) & ~mask); -} - /* * Open KVM_DEV_PATH if available, otherwise exit the entire program. 
* @@ -911,7 +902,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->mmap_start, errno); /* Align host address */ - region->host_mem = align(region->mmap_start, alignment); + region->host_mem = align_ptr_up(region->mmap_start, alignment); /* As needed perform madvise */ if ((src_type == VM_MEM_SRC_ANONYMOUS || @@ -954,7 +945,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, "mmap of alias failed, errno: %i", errno); /* Align host alias address */ - region->host_alias = align(region->mmap_alias, alignment); + region->host_alias = align_ptr_up(region->mmap_alias, alignment); } } diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 0ef80dbdc116..6b8d5020dc54 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -92,10 +92,10 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * perf_test_args.guest_page_size; - guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); + guest_test_phys_mem = align_down(guest_test_phys_mem, perf_test_args.host_page_size); #ifdef __s390x__ /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); + guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); From f4870ef3e15ab889a689f99a579fe0fe7c53a960 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:01 +0000 Subject: [PATCH 141/336] KVM: selftests: Assert mmap HVA is aligned when using HugeTLB Manually padding and aligning the mmap region is only needed when using THP. When using HugeTLB, mmap will always return an address aligned to the HugeTLB page size. Add a comment to clarify this and assert the mmap behavior for HugeTLB. [Removed requirement that HugeTLB mmaps must be padded per Yanan's feedback and added assertion that mmap returns aligned addresses when using HugeTLB.] 
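The new assertion amounts to "the mmap'd HVA is a multiple of the backing page
size". A standalone sketch of that predicate, exercised against the base page
size so it runs without HugeTLB configured (illustration only, not selftest
code):

  #include <assert.h>
  #include <stdint.h>
  #include <sys/mman.h>
  #include <unistd.h>

  /* True if addr is a multiple of the (power-of-two) page size. */
  static int is_aligned(void *addr, size_t pagesz)
  {
          return ((uintptr_t)addr & (pagesz - 1)) == 0;
  }

  int main(void)
  {
          size_t pagesz = getpagesize();
          void *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          assert(p != MAP_FAILED);
          assert(is_aligned(p, pagesz));  /* mmap returns page-aligned HVAs */
          munmap(p, pagesz);
          return 0;
  }

The patch spells the same predicate as
align_ptr_up(mmap_start, backing_src_pagesz) == mmap_start.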
Cc: Ben Gardon Cc: Yanan Wang Cc: Andrew Jones Cc: Peter Xu Cc: Aaron Lewis Signed-off-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/test_util.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 11 +++++++++++ tools/testing/selftests/kvm/lib/test_util.c | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 78c06310cc0e..99e0dcdc923f 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -104,6 +104,7 @@ size_t get_trans_hugepagesz(void); size_t get_def_hugetlb_pagesz(void); const struct vm_mem_backing_src_alias *vm_mem_backing_src_alias(uint32_t i); size_t get_backing_src_pagesz(uint32_t i); +bool is_backing_src_hugetlb(uint32_t i); void backing_src_help(const char *flag); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); long get_run_delay(void); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 63375118d48f..07f37456bba0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -866,6 +866,12 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, alignment = 1; #endif + /* + * When using THP mmap is not guaranteed to returned a hugepage aligned + * address so we have to pad the mmap. Padding is not needed for HugeTLB + * because mmap will always return an address aligned to the HugeTLB + * page size. + */ if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); @@ -901,6 +907,11 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, "test_malloc failed, mmap_start: %p errno: %i", region->mmap_start, errno); + TEST_ASSERT(!is_backing_src_hugetlb(src_type) || + region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz), + "mmap_start %p is not aligned to HugeTLB page size 0x%lx", + region->mmap_start, backing_src_pagesz); + /* Align host address */ region->host_mem = align_ptr_up(region->mmap_start, alignment); diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index b72429108993..6d23878bbfe1 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -283,6 +283,11 @@ size_t get_backing_src_pagesz(uint32_t i) } } +bool is_backing_src_hugetlb(uint32_t i) +{ + return !!(vm_mem_backing_src_alias(i)->flag & MAP_HUGETLB); +} + static void print_available_backing_src_types(const char *prefix) { int i; From 69cdcfa6f321da2cc1dd2e62fa4a9ee256299b18 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:02 +0000 Subject: [PATCH 142/336] KVM: selftests: Require GPA to be aligned when backed by hugepages Assert that the GPA for a memslot backed by a hugepage is aligned to the hugepage size and fix perf_test_util accordingly. Lack of GPA alignment prevents KVM from backing the guest with hugepages, e.g. x86's write-protection of hugepages when dirty logging is activated is otherwise not exercised. Add a comment explaining that guest_page_size is for non-huge pages to try and avoid confusion about what it actually tracks. Cc: Ben Gardon Cc: Yanan Wang Cc: Andrew Jones Cc: Peter Xu Cc: Aaron Lewis Signed-off-by: Sean Christopherson [Used get_backing_src_pagesz() to determine alignment dynamically.] 
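On the perf_test_util side, the base GPA is now rounded down to the backing
source page size rather than the host page size. With made-up numbers, the
effect is roughly:

  #include <assert.h>
  #include <stdint.h>

  static uint64_t align_down(uint64_t x, uint64_t size)
  {
          return x & ~(size - 1);
  }

  int main(void)
  {
          uint64_t guest_page_size = 0x1000;      /* 4 KiB guest pages      */
          uint64_t backing_pagesz  = 0x200000;    /* e.g. 2 MiB hugepages   */
          uint64_t max_gfn         = 0xfffff;     /* example value only     */
          uint64_t guest_num_pages = 0x4000;      /* example value only     */
          uint64_t gpa = (max_gfn - guest_num_pages) * guest_page_size;

          gpa = align_down(gpa, backing_pagesz);
          assert(gpa % backing_pagesz == 0);      /* eligible for hugepages */
          return 0;
  }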
Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++ tools/testing/selftests/kvm/lib/perf_test_util.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 07f37456bba0..1f6a01c33dce 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -875,6 +875,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); + ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); + /* Add enough memory to align up if necessary */ if (alignment > 1) region->mmap_size += alignment; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 6b8d5020dc54..a015f267d945 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -55,11 +55,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, { struct kvm_vm *vm; uint64_t guest_num_pages; + uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); perf_test_args.host_page_size = getpagesize(); + /* + * Snapshot the non-huge page size. This is used by the guest code to + * access/dirty pages at the logging granularity. + */ perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size; guest_num_pages = vm_adjust_num_guest_pages(mode, @@ -92,7 +97,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * perf_test_args.guest_page_size; - guest_test_phys_mem = align_down(guest_test_phys_mem, perf_test_args.host_page_size); + guest_test_phys_mem = align_down(guest_test_phys_mem, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); From b65e1051e489be4fe783cb14f1cd33235a0f9803 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:03 +0000 Subject: [PATCH 143/336] KVM: selftests: Use shorthand local var to access struct perf_test_args Use 'pta' as a local pointer to the global perf_test_args in order to shorten line lengths and make the code borderline readable. No functional change intended.
Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-6-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/lib/perf_test_util.c | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index a015f267d945..ccdc950c829e 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -24,7 +24,8 @@ static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; */ static void guest_code(uint32_t vcpu_id) { - struct perf_test_vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + struct perf_test_args *pta = &perf_test_args; + struct perf_test_vcpu_args *vcpu_args = &pta->vcpu_args[vcpu_id]; uint64_t gva; uint64_t pages; int i; @@ -37,9 +38,9 @@ static void guest_code(uint32_t vcpu_id) while (true) { for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * perf_test_args.guest_page_size); + uint64_t addr = gva + (i * pta->guest_page_size); - if (i % perf_test_args.wr_fract == 0) + if (i % pta->wr_fract == 0) *(uint64_t *)addr = 0x0123456789ABCDEF; else READ_ONCE(*(uint64_t *)addr); @@ -53,6 +54,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src) { + struct perf_test_args *pta = &perf_test_args; struct kvm_vm *vm; uint64_t guest_num_pages; uint64_t backing_src_pagesz = get_backing_src_pagesz(backing_src); @@ -60,29 +62,29 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - perf_test_args.host_page_size = getpagesize(); + pta->host_page_size = getpagesize(); /* * Snapshot the non-huge page size. This is used by the guest code to * access/dirty pages at the logging granularity. 
*/ - perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size; + pta->guest_page_size = vm_guest_mode_params[mode].page_size; guest_num_pages = vm_adjust_num_guest_pages(mode, - (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size); + (vcpus * vcpu_memory_bytes) / pta->guest_page_size); - TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + TEST_ASSERT(vcpu_memory_bytes % pta->host_page_size == 0, "Guest memory size is not host page size aligned."); - TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, + TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0, "Guest memory size is not guest page size aligned."); TEST_ASSERT(guest_num_pages % slots == 0, "Guest memory cannot be evenly divided into %d slots.", slots); vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, - (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, + (vcpus * vcpu_memory_bytes) / pta->guest_page_size, 0, guest_code, NULL); - perf_test_args.vm = vm; + pta->vm = vm; /* * If there should be more memory in the guest test region than there @@ -96,7 +98,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, vcpu_memory_bytes); guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - perf_test_args.guest_page_size; + pta->guest_page_size; guest_test_phys_mem = align_down(guest_test_phys_mem, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ @@ -108,7 +110,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, for (i = 0; i < slots; i++) { uint64_t region_pages = guest_num_pages / slots; vm_paddr_t region_start = guest_test_phys_mem + - region_pages * perf_test_args.guest_page_size * i; + region_pages * pta->guest_page_size * i; vm_userspace_mem_region_add(vm, backing_src, region_start, PERF_TEST_MEM_SLOT_INDEX + i, @@ -133,25 +135,26 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes, bool partition_vcpu_memory_access) { + struct perf_test_args *pta = &perf_test_args; vm_paddr_t vcpu_gpa; struct perf_test_vcpu_args *vcpu_args; int vcpu_id; for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + vcpu_args = &pta->vcpu_args[vcpu_id]; vcpu_args->vcpu_id = vcpu_id; if (partition_vcpu_memory_access) { vcpu_args->gva = guest_test_virt_mem + (vcpu_id * vcpu_memory_bytes); vcpu_args->pages = vcpu_memory_bytes / - perf_test_args.guest_page_size; + pta->guest_page_size; vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); } else { vcpu_args->gva = guest_test_virt_mem; vcpu_args->pages = (vcpus * vcpu_memory_bytes) / - perf_test_args.guest_page_size; + pta->guest_page_size; vcpu_gpa = guest_test_phys_mem; } @@ -159,6 +162,6 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + - (vcpu_args->pages * perf_test_args.guest_page_size)); + (vcpu_args->pages * pta->guest_page_size)); } } From 613d61182fffca6b36ea0df1e44927ccf45b1e9b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:04 +0000 Subject: [PATCH 144/336] KVM: selftests: Capture per-vCPU GPA in perf_test_vcpu_args Capture the per-vCPU GPA in perf_test_vcpu_args so that tests can get the GPA without having to calculate the GPA on their own. No functional change intended. 
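With the field populated, a test can consume the GPA directly instead of
re-deriving it, e.g. to resolve the host mapping of a vCPU's slice of test
memory. A minimal sketch against the selftests headers (the wrapper name is
illustrative only):

  #include "kvm_util.h"
  #include "perf_test_util.h"

  /* Resolve the HVA of a vCPU's test memory straight from perf_test_args. */
  static void *vcpu_test_mem_hva(struct kvm_vm *vm, int vcpu_id)
  {
          struct perf_test_vcpu_args *vcpu_args =
                  &perf_test_args.vcpu_args[vcpu_id];

          return addr_gpa2hva(vm, vcpu_args->gpa);
  }

This is the pattern the demand paging test switches to in the next patch.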
Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-7-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/perf_test_util.h | 1 + tools/testing/selftests/kvm/lib/perf_test_util.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index df9f1a3a3ffb..20aec72fe7b8 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -18,6 +18,7 @@ #define PERF_TEST_MEM_SLOT_INDEX 1 struct perf_test_vcpu_args { + uint64_t gpa; uint64_t gva; uint64_t pages; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index ccdc950c829e..d9c6bcb7964d 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -136,7 +136,6 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, bool partition_vcpu_memory_access) { struct perf_test_args *pta = &perf_test_args; - vm_paddr_t vcpu_gpa; struct perf_test_vcpu_args *vcpu_args; int vcpu_id; @@ -149,19 +148,19 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, (vcpu_id * vcpu_memory_bytes); vcpu_args->pages = vcpu_memory_bytes / pta->guest_page_size; - vcpu_gpa = guest_test_phys_mem + - (vcpu_id * vcpu_memory_bytes); + vcpu_args->gpa = guest_test_phys_mem + + (vcpu_id * vcpu_memory_bytes); } else { vcpu_args->gva = guest_test_virt_mem; vcpu_args->pages = (vcpus * vcpu_memory_bytes) / pta->guest_page_size; - vcpu_gpa = guest_test_phys_mem; + vcpu_args->gpa = guest_test_phys_mem; } vcpu_args_set(vm, vcpu_id, 1, vcpu_id); pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + + vcpu_id, vcpu_args->gpa, vcpu_args->gpa + (vcpu_args->pages * pta->guest_page_size)); } } From 92e34c9974f55519bc0c3386221aadf387162ea6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:05 +0000 Subject: [PATCH 145/336] KVM: selftests: Use perf util's per-vCPU GPA/pages in demand paging test Grab the per-vCPU GPA and number of pages from perf_util in the demand paging test instead of duplicating perf_util's calculations. Note, this may or may not result in a functional change. It's not clear that the test's calculations are guaranteed to yield the same value as perf_util, e.g. if guest_percpu_mem_size != vcpu_args->pages. 
Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-8-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/demand_paging_test.c | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 1510b21e6306..3c729a0a1ab1 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -322,26 +322,15 @@ static void run_test(enum vm_guest_mode mode, void *arg) TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd"); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - vm_paddr_t vcpu_gpa; + struct perf_test_vcpu_args *vcpu_args; void *vcpu_hva; void *vcpu_alias; - uint64_t vcpu_mem_size; - - if (p->partition_vcpu_memory_access) { - vcpu_gpa = guest_test_phys_mem + - (vcpu_id * guest_percpu_mem_size); - vcpu_mem_size = guest_percpu_mem_size; - } else { - vcpu_gpa = guest_test_phys_mem; - vcpu_mem_size = guest_percpu_mem_size * nr_vcpus; - } - PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size); + vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; /* Cache the host addresses of the region */ - vcpu_hva = addr_gpa2hva(vm, vcpu_gpa); - vcpu_alias = addr_gpa2alias(vm, vcpu_gpa); + vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); + vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa); /* * Set up user fault fd to handle demand paging @@ -355,7 +344,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pipefds[vcpu_id * 2], p->uffd_mode, p->uffd_delay, &uffd_args[vcpu_id], vcpu_hva, vcpu_alias, - vcpu_mem_size); + vcpu_args->pages * perf_test_args.guest_page_size); } } From b91b637f4a595c5be435e215f78b1a3bd8c252b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:06 +0000 Subject: [PATCH 146/336] KVM: selftests: Move per-VM GPA into perf_test_args Move the per-VM GPA into perf_test_args instead of storing it as a separate global variable. It's not obvious that guest_test_phys_mem holds a GPA, nor that it's connected/coupled with per_vcpu->gpa. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-9-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/perf_test_util.h | 8 +------ .../selftests/kvm/lib/perf_test_util.c | 21 +++++++------------ .../kvm/memslot_modification_stress_test.c | 2 +- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 20aec72fe7b8..d7cde1ab2a85 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -29,6 +29,7 @@ struct perf_test_vcpu_args { struct perf_test_args { struct kvm_vm *vm; uint64_t host_page_size; + uint64_t gpa; uint64_t guest_page_size; int wr_fract; @@ -37,13 +38,6 @@ struct perf_test_args { extern struct perf_test_args perf_test_args; -/* - * Guest physical memory offset of the testing memory slot. - * This will be set to the topmost valid physical address minus - * the test memory size. 
- */ -extern uint64_t guest_test_phys_mem; - struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src); diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index d9c6bcb7964d..0fc2d834c1c7 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -10,8 +10,6 @@ struct perf_test_args perf_test_args; -uint64_t guest_test_phys_mem; - /* * Guest virtual memory offset of the testing memory slot. * Must not conflict with identity mapped test code. @@ -97,20 +95,18 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, guest_num_pages, vm_get_max_gfn(vm), vcpus, vcpu_memory_bytes); - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - pta->guest_page_size; - guest_test_phys_mem = align_down(guest_test_phys_mem, backing_src_pagesz); + pta->gpa = (vm_get_max_gfn(vm) - guest_num_pages) * pta->guest_page_size; + pta->gpa = align_down(pta->gpa, backing_src_pagesz); #ifdef __s390x__ /* Align to 1M (segment size) */ - guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); + pta->gpa = align_down(pta->gpa, 1 << 20); #endif - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + pr_info("guest physical test memory offset: 0x%lx\n", pta->gpa); /* Add extra memory slots for testing */ for (i = 0; i < slots; i++) { uint64_t region_pages = guest_num_pages / slots; - vm_paddr_t region_start = guest_test_phys_mem + - region_pages * pta->guest_page_size * i; + vm_paddr_t region_start = pta->gpa + region_pages * pta->guest_page_size * i; vm_userspace_mem_region_add(vm, backing_src, region_start, PERF_TEST_MEM_SLOT_INDEX + i, @@ -118,7 +114,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, } /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); + virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages); ucall_init(vm, NULL); @@ -148,13 +144,12 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, (vcpu_id * vcpu_memory_bytes); vcpu_args->pages = vcpu_memory_bytes / pta->guest_page_size; - vcpu_args->gpa = guest_test_phys_mem + - (vcpu_id * vcpu_memory_bytes); + vcpu_args->gpa = pta->gpa + (vcpu_id * vcpu_memory_bytes); } else { vcpu_args->gva = guest_test_virt_mem; vcpu_args->pages = (vcpus * vcpu_memory_bytes) / pta->guest_page_size; - vcpu_args->gpa = guest_test_phys_mem; + vcpu_args->gpa = pta->gpa; } vcpu_args_set(vm, vcpu_id, 1, vcpu_id); diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 4cfcafea9f5a..d105180d5e8c 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -80,7 +80,7 @@ static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, * Add the dummy memslot just below the perf_test_util memslot, which is * at the top of the guest physical address space. 
*/ - gpa = guest_test_phys_mem - pages * vm_get_page_size(vm); + gpa = perf_test_args.gpa - pages * vm_get_page_size(vm); for (i = 0; i < nr_modifications; i++) { usleep(delay); From a5ac0fd1b90ae811ba51da6a9928633bddefb792 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:07 +0000 Subject: [PATCH 147/336] KVM: selftests: Remove perf_test_args.host_page_size Remove perf_test_args.host_page_size and instead use getpagesize() so that it's somewhat obvious that, for tests that care about the host page size, they care about the system page size, not the hardware page size, e.g. that the logic is unchanged if hugepages are in play. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-10-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/perf_test_util.h | 1 - tools/testing/selftests/kvm/lib/perf_test_util.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index d7cde1ab2a85..9348580dc5be 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -28,7 +28,6 @@ struct perf_test_vcpu_args { struct perf_test_args { struct kvm_vm *vm; - uint64_t host_page_size; uint64_t gpa; uint64_t guest_page_size; int wr_fract; diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 0fc2d834c1c7..a0aded8cfce3 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -60,7 +60,6 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - pta->host_page_size = getpagesize(); /* * Snapshot the non-huge page size. This is used by the guest code to * access/dirty pages at the logging granularity. @@ -70,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, guest_num_pages = vm_adjust_num_guest_pages(mode, (vcpus * vcpu_memory_bytes) / pta->guest_page_size); - TEST_ASSERT(vcpu_memory_bytes % pta->host_page_size == 0, + TEST_ASSERT(vcpu_memory_bytes % getpagesize() == 0, "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % pta->guest_page_size == 0, "Guest memory size is not guest page size aligned."); From f5e8fe2a92e4923b63d1edd6ed53d9856b6515ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:08 +0000 Subject: [PATCH 148/336] KVM: selftests: Create VM with adjusted number of guest pages for perf tests Use the already computed guest_num_pages when creating the so called extra VM pages for a perf test, and add a comment explaining why the pages are allocated as extra pages. 
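For illustration, the resulting shape looks roughly like the sketch below; the
wrapper itself is made up, with names borrowed from perf_test_create_vm():

  #include "kvm_util.h"

  /* Size the page tables for the (mode-adjusted) amount of test memory. */
  static struct kvm_vm *create_test_vm(enum vm_guest_mode mode, int vcpus,
                                       uint64_t vcpu_memory_bytes,
                                       uint64_t guest_page_size, void *guest_code)
  {
          uint64_t guest_num_pages;

          guest_num_pages = vm_adjust_num_guest_pages(mode,
                          (vcpus * vcpu_memory_bytes) / guest_page_size);

          return vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
                                      guest_num_pages, 0, guest_code, NULL);
  }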
Signed-off-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: David Matlack Message-Id: <20211111000310.1435032-11-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/perf_test_util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index a0aded8cfce3..b3154b5b0cfd 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -77,9 +77,13 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, "Guest memory cannot be evenly divided into %d slots.", slots); + /* + * Pass guest_num_pages to populate the page tables for test memory. + * The memory is also added to memslot 0, but that's a benign side + * effect as KVM allows aliasing HVAs in meslots. + */ vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, - (vcpus * vcpu_memory_bytes) / pta->guest_page_size, - 0, guest_code, NULL); + guest_num_pages, 0, guest_code, NULL); pta->vm = vm; From cf1d59300ab27af6a2e96b4882fe3d9a72b32b15 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:09 +0000 Subject: [PATCH 149/336] KVM: selftests: Fill per-vCPU struct during "perf_test" VM creation Fill the per-vCPU args when creating the perf_test VM instead of having the caller do so. This helps ensure that any adjustments to the number of pages (and thus vcpu_memory_bytes) are reflected in the per-VM args. Automatically filling the per-vCPU args will also allow a future patch to do the sync to the guest during creation. Signed-off-by: Sean Christopherson [Updated access_tracking_perf_test as well.] Signed-off-by: David Matlack Reviewed-by: Ben Gardon Message-Id: <20211111000310.1435032-12-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/access_tracking_perf_test.c | 5 +- .../selftests/kvm/demand_paging_test.c | 5 +- .../selftests/kvm/dirty_log_perf_test.c | 6 +- .../selftests/kvm/include/perf_test_util.h | 6 +- .../selftests/kvm/lib/perf_test_util.c | 71 ++++++++++--------- .../kvm/memslot_modification_stress_test.c | 6 +- 6 files changed, 45 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 5d95113c7b7c..fdef6c906388 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -332,10 +332,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpus = params->vcpus; vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, - params->backing_src); - - perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, - !overlap_memory_access); + params->backing_src, !overlap_memory_access); vcpu_threads = create_vcpu_threads(vcpus); diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 3c729a0a1ab1..0fee44f5e5ae 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -293,7 +293,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int r; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, - p->src_type); + p->src_type, p->partition_vcpu_memory_access); perf_test_args.wr_fract = 1; @@ -307,9 +307,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * 
sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, - p->partition_vcpu_memory_access); - if (p->uffd_mode) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 7ffab5bd5ce5..62f9cc2a3146 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -186,7 +186,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - p->slots, p->backing_src); + p->slots, p->backing_src, + p->partition_vcpu_memory_access); perf_test_args.wr_fract = p->wr_fract; @@ -206,9 +207,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, - p->partition_vcpu_memory_access); - sync_global_to_guest(vm, perf_test_args); /* Start the iterations */ diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 9348580dc5be..91804be1cf53 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -39,10 +39,8 @@ extern struct perf_test_args perf_test_args; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, uint64_t vcpu_memory_bytes, int slots, - enum vm_mem_backing_src_type backing_src); + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access); void perf_test_destroy_vm(struct kvm_vm *vm); -void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, - uint64_t vcpu_memory_bytes, - bool partition_vcpu_memory_access); #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index b3154b5b0cfd..13c8bc22f4e1 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -48,9 +48,43 @@ static void guest_code(uint32_t vcpu_id) } } +void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, + uint64_t vcpu_memory_bytes, + bool partition_vcpu_memory_access) +{ + struct perf_test_args *pta = &perf_test_args; + struct perf_test_vcpu_args *vcpu_args; + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vcpu_args = &pta->vcpu_args[vcpu_id]; + + vcpu_args->vcpu_id = vcpu_id; + if (partition_vcpu_memory_access) { + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + pta->guest_page_size; + vcpu_args->gpa = pta->gpa + (vcpu_id * vcpu_memory_bytes); + } else { + vcpu_args->gva = guest_test_virt_mem; + vcpu_args->pages = (vcpus * vcpu_memory_bytes) / + pta->guest_page_size; + vcpu_args->gpa = pta->gpa; + } + + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_args->gpa, vcpu_args->gpa + + (vcpu_args->pages * pta->guest_page_size)); + } +} + struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, uint64_t vcpu_memory_bytes, int slots, - enum vm_mem_backing_src_type backing_src) + enum vm_mem_backing_src_type backing_src, + bool partition_vcpu_memory_access) { struct 
perf_test_args *pta = &perf_test_args; struct kvm_vm *vm; @@ -119,6 +153,8 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, pta->gpa, guest_num_pages); + perf_test_setup_vcpus(vm, vcpus, vcpu_memory_bytes, partition_vcpu_memory_access); + ucall_init(vm, NULL); return vm; @@ -129,36 +165,3 @@ void perf_test_destroy_vm(struct kvm_vm *vm) ucall_uninit(vm); kvm_vm_free(vm); } - -void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, - uint64_t vcpu_memory_bytes, - bool partition_vcpu_memory_access) -{ - struct perf_test_args *pta = &perf_test_args; - struct perf_test_vcpu_args *vcpu_args; - int vcpu_id; - - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vcpu_args = &pta->vcpu_args[vcpu_id]; - - vcpu_args->vcpu_id = vcpu_id; - if (partition_vcpu_memory_access) { - vcpu_args->gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args->pages = vcpu_memory_bytes / - pta->guest_page_size; - vcpu_args->gpa = pta->gpa + (vcpu_id * vcpu_memory_bytes); - } else { - vcpu_args->gva = guest_test_virt_mem; - vcpu_args->pages = (vcpus * vcpu_memory_bytes) / - pta->guest_page_size; - vcpu_args->gpa = pta->gpa; - } - - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - - pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_args->gpa, vcpu_args->gpa + - (vcpu_args->pages * pta->guest_page_size)); - } -} diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index d105180d5e8c..27af0bb8deb7 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -105,16 +105,14 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, - VM_MEM_SRC_ANONYMOUS); + VM_MEM_SRC_ANONYMOUS, + p->partition_vcpu_memory_access); perf_test_args.wr_fract = 1; vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size, - p->partition_vcpu_memory_access); - /* Export the shared variables to the guest */ sync_global_to_guest(vm, perf_test_args); From 13bbc70329c8df003e64c4fbea8678f9db0e75d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 00:03:10 +0000 Subject: [PATCH 150/336] KVM: selftests: Sync perf_test_args to guest during VM creation Copy perf_test_args to the guest during VM creation instead of relying on the caller to do so at their leisure. Ideally, tests wouldn't even be able to modify perf_test_args, i.e. they would have no motivation to do the sync, but enforcing that is arguably a net negative for readability. No functional change intended. [Set wr_fract=1 by default and add helper to override it since the new access_tracking_perf_test needs to set it dynamically.] 
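A rough sketch of the intended usage; run_pass() stands in for a test's own
"run every vCPU once" driver, and the INT_MAX trick mirrors how
access_tracking_perf_test flips between read and write passes:

  #include <limits.h>

  #include "kvm_util.h"
  #include "perf_test_util.h"

  /* One read-mostly pass, then a write pass. Each perf_test_set_wr_fract()
   * call re-syncs perf_test_args into the guest. */
  static void read_then_write_pass(struct kvm_vm *vm,
                                   void (*run_pass)(struct kvm_vm *vm))
  {
          perf_test_set_wr_fract(vm, INT_MAX);    /* nearly every access reads */
          run_pass(vm);

          perf_test_set_wr_fract(vm, 1);          /* every access writes       */
          run_pass(vm);
  }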
Signed-off-by: Sean Christopherson Signed-off-by: David Matlack Reviewed-by: Ben Gardon Message-Id: <20211111000310.1435032-13-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/access_tracking_perf_test.c | 3 +-- tools/testing/selftests/kvm/demand_paging_test.c | 5 ----- tools/testing/selftests/kvm/dirty_log_perf_test.c | 4 +--- tools/testing/selftests/kvm/include/perf_test_util.h | 2 ++ tools/testing/selftests/kvm/lib/perf_test_util.c | 12 ++++++++++++ .../selftests/kvm/memslot_modification_stress_test.c | 5 ----- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index fdef6c906388..5364a2ed7c68 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -277,8 +277,7 @@ static void run_iteration(struct kvm_vm *vm, int vcpus, const char *description) static void access_memory(struct kvm_vm *vm, int vcpus, enum access_type access, const char *description) { - perf_test_args.wr_fract = (access == ACCESS_READ) ? INT_MAX : 1; - sync_global_to_guest(vm, perf_test_args); + perf_test_set_wr_fract(vm, (access == ACCESS_READ) ? INT_MAX : 1); iteration_work = ITERATION_ACCESS_MEMORY; run_iteration(vm, vcpus, description); } diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 0fee44f5e5ae..26f8fd8a57ec 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -295,8 +295,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type, p->partition_vcpu_memory_access); - perf_test_args.wr_fract = 1; - demand_paging_size = get_backing_src_pagesz(p->src_type); guest_data_prototype = malloc(demand_paging_size); @@ -345,9 +343,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) } } - /* Export the shared variables to the guest */ - sync_global_to_guest(vm, perf_test_args); - pr_info("Finished creating vCPUs and starting uffd threads\n"); clock_gettime(CLOCK_MONOTONIC, &start); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 62f9cc2a3146..583b4d95aa98 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -189,7 +189,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) p->slots, p->backing_src, p->partition_vcpu_memory_access); - perf_test_args.wr_fract = p->wr_fract; + perf_test_set_wr_fract(vm, p->wr_fract); guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); @@ -207,8 +207,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - sync_global_to_guest(vm, perf_test_args); - /* Start the iterations */ iteration = 0; host_quit = false; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 91804be1cf53..74e3622b3a6e 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -43,4 +43,6 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, bool 
partition_vcpu_memory_access); void perf_test_destroy_vm(struct kvm_vm *vm); +void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract); + #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 13c8bc22f4e1..77f9eb5667c9 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -94,6 +94,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + /* By default vCPUs will write to memory. */ + pta->wr_fract = 1; + /* * Snapshot the non-huge page size. This is used by the guest code to * access/dirty pages at the logging granularity. @@ -157,6 +160,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, ucall_init(vm, NULL); + /* Export the shared variables to the guest. */ + sync_global_to_guest(vm, perf_test_args); + return vm; } @@ -165,3 +171,9 @@ void perf_test_destroy_vm(struct kvm_vm *vm) ucall_uninit(vm); kvm_vm_free(vm); } + +void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) +{ + perf_test_args.wr_fract = wr_fract; + sync_global_to_guest(vm, perf_test_args); +} diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 27af0bb8deb7..df431d0da1ee 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -108,14 +108,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) VM_MEM_SRC_ANONYMOUS, p->partition_vcpu_memory_access); - perf_test_args.wr_fract = 1; - vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - /* Export the shared variables to the guest */ - sync_global_to_guest(vm, perf_test_args); - pr_info("Finished creating vCPUs\n"); for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) From 36c5ad73d7016f34146cf0821c78f08737bdb5e9 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 11 Nov 2021 00:12:54 +0000 Subject: [PATCH 151/336] KVM: selftests: Start at iteration 0 instead of -1 Start at iteration 0 instead of -1 to avoid having to initialize vcpu_last_completed_iteration when setting up vCPU threads. This simplifies the next commit where we move vCPU thread initialization out to a common helper. No functional change intended. Signed-off-by: David Matlack Message-Id: <20211111001257.1446428-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/access_tracking_perf_test.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 5364a2ed7c68..7f25a06e19c9 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -47,7 +47,7 @@ #include "guest_modes.h" /* Global variable used to synchronize all of the vCPU threads. */ -static int iteration = -1; +static int iteration; /* Defines what vCPU threads should do during a given iteration. 
*/ static enum { @@ -220,7 +220,7 @@ static void *vcpu_thread_main(void *arg) struct perf_test_vcpu_args *vcpu_args = arg; struct kvm_vm *vm = perf_test_args.vm; int vcpu_id = vcpu_args->vcpu_id; - int current_iteration = -1; + int current_iteration = 0; while (spin_wait_for_next_iteration(¤t_iteration)) { switch (READ_ONCE(iteration_work)) { @@ -303,11 +303,9 @@ static pthread_t *create_vcpu_threads(int vcpus) vcpu_threads = malloc(vcpus * sizeof(vcpu_threads[0])); TEST_ASSERT(vcpu_threads, "Failed to allocate vcpu_threads."); - for (i = 0; i < vcpus; i++) { - vcpu_last_completed_iteration[i] = iteration; + for (i = 0; i < vcpus; i++) pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main, &perf_test_args.vcpu_args[i]); - } return vcpu_threads; } From 81bcb26172a8f00840e0ca44277272dcb673887a Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 11 Nov 2021 00:12:55 +0000 Subject: [PATCH 152/336] KVM: selftests: Move vCPU thread creation and joining to common helpers Move vCPU thread creation and joining to common helper functions. This is in preparation for the next commit which ensures that all vCPU threads are fully created before entering guest mode on any one vCPU. No functional change intended. Signed-off-by: David Matlack Reviewed-by: Ben Gardon Message-Id: <20211111001257.1446428-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/access_tracking_perf_test.c | 40 +++------------- .../selftests/kvm/demand_paging_test.c | 25 ++-------- .../selftests/kvm/dirty_log_perf_test.c | 19 ++------ .../selftests/kvm/include/perf_test_util.h | 5 ++ .../selftests/kvm/lib/perf_test_util.c | 46 +++++++++++++++++++ .../kvm/memslot_modification_stress_test.c | 22 ++------- 6 files changed, 67 insertions(+), 90 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 7f25a06e19c9..d8909032317a 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -215,9 +215,8 @@ static bool spin_wait_for_next_iteration(int *current_iteration) return true; } -static void *vcpu_thread_main(void *arg) +static void vcpu_thread_main(struct perf_test_vcpu_args *vcpu_args) { - struct perf_test_vcpu_args *vcpu_args = arg; struct kvm_vm *vm = perf_test_args.vm; int vcpu_id = vcpu_args->vcpu_id; int current_iteration = 0; @@ -235,8 +234,6 @@ static void *vcpu_thread_main(void *arg) vcpu_last_completed_iteration[vcpu_id] = current_iteration; } - - return NULL; } static void spin_wait_for_vcpu(int vcpu_id, int target_iteration) @@ -295,43 +292,16 @@ static void mark_memory_idle(struct kvm_vm *vm, int vcpus) run_iteration(vm, vcpus, "Mark memory idle"); } -static pthread_t *create_vcpu_threads(int vcpus) -{ - pthread_t *vcpu_threads; - int i; - - vcpu_threads = malloc(vcpus * sizeof(vcpu_threads[0])); - TEST_ASSERT(vcpu_threads, "Failed to allocate vcpu_threads."); - - for (i = 0; i < vcpus; i++) - pthread_create(&vcpu_threads[i], NULL, vcpu_thread_main, - &perf_test_args.vcpu_args[i]); - - return vcpu_threads; -} - -static void terminate_vcpu_threads(pthread_t *vcpu_threads, int vcpus) -{ - int i; - - /* Set done to signal the vCPU threads to exit */ - done = true; - - for (i = 0; i < vcpus; i++) - pthread_join(vcpu_threads[i], NULL); -} - static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *params = arg; struct kvm_vm *vm; - pthread_t *vcpu_threads; int vcpus = params->vcpus; vm = perf_test_create_vm(mode, 
vcpus, params->vcpu_memory_bytes, 1, params->backing_src, !overlap_memory_access); - vcpu_threads = create_vcpu_threads(vcpus); + perf_test_start_vcpu_threads(vcpus, vcpu_thread_main); pr_info("\n"); access_memory(vm, vcpus, ACCESS_WRITE, "Populating memory"); @@ -346,8 +316,10 @@ static void run_test(enum vm_guest_mode mode, void *arg) mark_memory_idle(vm, vcpus); access_memory(vm, vcpus, ACCESS_READ, "Reading from idle memory"); - terminate_vcpu_threads(vcpu_threads, vcpus); - free(vcpu_threads); + /* Set done to signal the vCPU threads to exit */ + done = true; + + perf_test_join_vcpu_threads(vcpus); perf_test_destroy_vm(vm); } diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 26f8fd8a57ec..6a719d065599 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -42,10 +42,9 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; static size_t demand_paging_size; static char *guest_data_prototype; -static void *vcpu_worker(void *data) +static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { int ret; - struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; @@ -68,8 +67,6 @@ static void *vcpu_worker(void *data) ts_diff = timespec_elapsed(start); PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id, ts_diff.tv_sec, ts_diff.tv_nsec); - - return NULL; } static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr) @@ -282,7 +279,6 @@ struct test_params { static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; - pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; struct uffd_handler_args *uffd_args = NULL; struct timespec start; @@ -302,9 +298,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) "Failed to allocate buffer for guest data pattern"); memset(guest_data_prototype, 0xAB, demand_paging_size); - vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); - TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - if (p->uffd_mode) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); @@ -346,22 +339,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) pr_info("Finished creating vCPUs and starting uffd threads\n"); clock_gettime(CLOCK_MONOTONIC, &start); - - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &perf_test_args.vcpu_args[vcpu_id]); - } - + perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); pr_info("Started all vCPUs\n"); - /* Wait for the vcpu threads to quit */ - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { - pthread_join(vcpu_threads[vcpu_id], NULL); - PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id); - } - + perf_test_join_vcpu_threads(nr_vcpus); ts_diff = timespec_elapsed(start); - pr_info("All vCPU threads joined\n"); if (p->uffd_mode) { @@ -385,7 +367,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_destroy_vm(vm); free(guest_data_prototype); - free(vcpu_threads); if (p->uffd_mode) { free(uffd_handler_threads); free(uffd_args); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 583b4d95aa98..1954b964d1cf 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -31,7 +31,7 
@@ static bool host_quit; static int iteration; static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; -static void *vcpu_worker(void *data) +static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { int ret; struct kvm_vm *vm = perf_test_args.vm; @@ -41,7 +41,6 @@ static void *vcpu_worker(void *data) struct timespec ts_diff; struct timespec total = (struct timespec){0}; struct timespec avg; - struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; run = vcpu_state(vm, vcpu_id); @@ -83,8 +82,6 @@ static void *vcpu_worker(void *data) pr_debug("\nvCPU %d dirtied 0x%lx pages over %d iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id], total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec); - - return NULL; } struct test_params { @@ -170,7 +167,6 @@ static void free_bitmaps(unsigned long *bitmaps[], int slots) static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; - pthread_t *vcpu_threads; struct kvm_vm *vm; unsigned long **bitmaps; uint64_t guest_num_pages; @@ -204,20 +200,15 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm_enable_cap(vm, &cap); } - vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); - TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - /* Start the iterations */ iteration = 0; host_quit = false; clock_gettime(CLOCK_MONOTONIC, &start); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) { + for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) vcpu_last_completed_iteration[vcpu_id] = -1; - pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &perf_test_args.vcpu_args[vcpu_id]); - } + perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); /* Allow the vCPUs to populate memory */ pr_debug("Starting iteration %d - Populating\n", iteration); @@ -286,8 +277,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Tell the vcpu thread to quit */ host_quit = true; - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - pthread_join(vcpu_threads[vcpu_id], NULL); + perf_test_join_vcpu_threads(nr_vcpus); avg = timespec_div(get_dirty_log_total, p->iterations); pr_info("Get dirty log over %lu iterations took %ld.%.9lds. 
(Avg %ld.%.9lds/iteration)\n", @@ -302,7 +292,6 @@ static void run_test(enum vm_guest_mode mode, void *arg) } free_bitmaps(bitmaps, p->slots); - free(vcpu_threads); perf_test_destroy_vm(vm); } diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 74e3622b3a6e..a86f953d8d36 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -8,6 +8,8 @@ #ifndef SELFTEST_KVM_PERF_TEST_UTIL_H #define SELFTEST_KVM_PERF_TEST_UTIL_H +#include + #include "kvm_util.h" /* Default guest test virtual memory offset */ @@ -45,4 +47,7 @@ void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract); +void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)); +void perf_test_join_vcpu_threads(int vcpus); + #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index 77f9eb5667c9..d646477ed16a 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -16,6 +16,20 @@ struct perf_test_args perf_test_args; */ static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; +struct vcpu_thread { + /* The id of the vCPU. */ + int vcpu_id; + + /* The pthread backing the vCPU. */ + pthread_t thread; +}; + +/* The vCPU threads involved in this test. */ +static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; + +/* The function run by each vCPU thread, as provided by the test. */ +static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *); + /* * Continuously write to the first 8 bytes of each page in the * specified region. 
@@ -177,3 +191,35 @@ void perf_test_set_wr_fract(struct kvm_vm *vm, int wr_fract) perf_test_args.wr_fract = wr_fract; sync_global_to_guest(vm, perf_test_args); } + +static void *vcpu_thread_main(void *data) +{ + struct vcpu_thread *vcpu = data; + + vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]); + + return NULL; +} + +void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vcpu_args *)) +{ + int vcpu_id; + + vcpu_thread_fn = vcpu_fn; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id]; + + vcpu->vcpu_id = vcpu_id; + + pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); + } +} + +void perf_test_join_vcpu_threads(int vcpus) +{ + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) + pthread_join(vcpu_threads[vcpu_id].thread, NULL); +} diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index df431d0da1ee..5bd0b076f57f 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -36,11 +36,9 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; static bool run_vcpus = true; -static void *vcpu_worker(void *data) +static void vcpu_worker(struct perf_test_vcpu_args *vcpu_args) { int ret; - struct perf_test_vcpu_args *vcpu_args = - (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; @@ -59,8 +57,6 @@ static void *vcpu_worker(void *data) "Invalid guest sync status: exit_reason=%s\n", exit_reason_str(run->exit_reason)); } - - return NULL; } struct memslot_antagonist_args { @@ -100,22 +96,15 @@ struct test_params { static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; - pthread_t *vcpu_threads; struct kvm_vm *vm; - int vcpu_id; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS, p->partition_vcpu_memory_access); - vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); - TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - pr_info("Finished creating vCPUs\n"); - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker, - &perf_test_args.vcpu_args[vcpu_id]); + perf_test_start_vcpu_threads(nr_vcpus, vcpu_worker); pr_info("Started all vCPUs\n"); @@ -124,16 +113,11 @@ static void run_test(enum vm_guest_mode mode, void *arg) run_vcpus = false; - /* Wait for the vcpu threads to quit */ - for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) - pthread_join(vcpu_threads[vcpu_id], NULL); - + perf_test_join_vcpu_threads(nr_vcpus); pr_info("All vCPU threads joined\n"); ucall_uninit(vm); kvm_vm_free(vm); - - free(vcpu_threads); } static void help(char *name) From 89d9a43c1d2d3d703fae25c990a1d98dd178dd17 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 11 Nov 2021 00:12:56 +0000 Subject: [PATCH 153/336] KVM: selftests: Wait for all vCPU to be created before entering guest mode Thread creation requires taking the mmap_sem in write mode, which causes vCPU threads running in guest mode to block while they are populating memory. Fix this by waiting for all vCPU threads to be created and start running before entering guest mode on any one vCPU thread. 
This substantially improves the "Populate memory time" when using 1GiB pages since it allows all vCPUs to zero pages in parallel rather than blocking because a writer is waiting (which is waiting for another vCPU that is busy zeroing a 1GiB page). Before: $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb ... Populate memory time: 52.811184013s After: $ ./dirty_log_perf_test -v256 -s anonymous_hugetlb_1gb ... Populate memory time: 10.204573342s Signed-off-by: David Matlack Message-Id: <20211111001257.1446428-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/lib/perf_test_util.c | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index d646477ed16a..722df3a28791 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -22,6 +22,9 @@ struct vcpu_thread { /* The pthread backing the vCPU. */ pthread_t thread; + + /* Set to true once the vCPU thread is up and running. */ + bool running; }; /* The vCPU threads involved in this test. */ @@ -30,6 +33,9 @@ static struct vcpu_thread vcpu_threads[KVM_MAX_VCPUS]; /* The function run by each vCPU thread, as provided by the test. */ static void (*vcpu_thread_fn)(struct perf_test_vcpu_args *); +/* Set to true once all vCPU threads are up and running. */ +static bool all_vcpu_threads_running; + /* * Continuously write to the first 8 bytes of each page in the * specified region. @@ -196,6 +202,17 @@ static void *vcpu_thread_main(void *data) { struct vcpu_thread *vcpu = data; + WRITE_ONCE(vcpu->running, true); + + /* + * Wait for all vCPU threads to be up and running before calling the test- + * provided vCPU thread function. This prevents thread creation (which + * requires taking the mmap_sem in write mode) from interfering with the + * guest faulting in its memory. + */ + while (!READ_ONCE(all_vcpu_threads_running)) + ; + vcpu_thread_fn(&perf_test_args.vcpu_args[vcpu->vcpu_id]); return NULL; @@ -206,14 +223,23 @@ void perf_test_start_vcpu_threads(int vcpus, void (*vcpu_fn)(struct perf_test_vc int vcpu_id; vcpu_thread_fn = vcpu_fn; + WRITE_ONCE(all_vcpu_threads_running, false); for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { struct vcpu_thread *vcpu = &vcpu_threads[vcpu_id]; vcpu->vcpu_id = vcpu_id; + WRITE_ONCE(vcpu->running, false); pthread_create(&vcpu->thread, NULL, vcpu_thread_main, vcpu); } + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + while (!READ_ONCE(vcpu_threads[vcpu_id].running)) + ; + } + + WRITE_ONCE(all_vcpu_threads_running, true); } void perf_test_join_vcpu_threads(int vcpus) From e2bd936581038f3107c45e8ae32309a567b54bf4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 11 Nov 2021 00:12:57 +0000 Subject: [PATCH 154/336] KVM: selftests: Use perf_test_destroy_vm in memslot_modification_stress_test Change memslot_modification_stress_test to use perf_test_destroy_vm instead of manually calling ucall_uninit and kvm_vm_free. No functional change intended. 
Signed-off-by: David Matlack Reviewed-by: Ben Gardon Message-Id: <20211111001257.1446428-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/memslot_modification_stress_test.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 5bd0b076f57f..1410d0a9141a 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -116,8 +116,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_join_vcpu_threads(nr_vcpus); pr_info("All vCPU threads joined\n"); - ucall_uninit(vm); - kvm_vm_free(vm); + perf_test_destroy_vm(vm); } static void help(char *name) From c5adbb3af051079f35abfa26551107e2c653087f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=84=E4=B9=90?= Date: Mon, 15 Nov 2021 14:08:29 +0000 Subject: [PATCH 155/336] KVM: x86: Fix uninitialized eoi_exit_bitmap usage in vcpu_load_eoi_exitmap() In vcpu_load_eoi_exitmap(), currently the eoi_exit_bitmap[4] array is initialized only when Hyper-V context is available, in other path it is just passed to kvm_x86_ops.load_eoi_exitmap() directly from on the stack, which would cause unexpected interrupt delivery/handling issues, e.g. an *old* linux kernel that relies on PIT to do clock calibration on KVM might randomly fail to boot. Fix it by passing ioapic_handled_vectors to load_eoi_exitmap() when Hyper-V context is not available. Fixes: f2bc14b69c38 ("KVM: x86: hyper-v: Prepare to meet unallocated Hyper-V context") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Signed-off-by: Huang Le Message-Id: <62115b277dab49ea97da5633f8522daf@jd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c479ae57693..2c03b76caf11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9640,12 +9640,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - if (to_hv_vcpu(vcpu)) + if (to_hv_vcpu(vcpu)) { bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); + static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + return; + } - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + static_call(kvm_x86_load_eoi_exitmap)( + vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, From 6f019c0e0193411add33c195a226f4d694499f45 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 12 Nov 2021 12:47:30 +0800 Subject: [PATCH 156/336] btrfs: fix a out-of-bound access in copy_compressed_data_to_page() [BUG] The following script can cause btrfs to crash: $ mount -o compress-force=lzo $DEV /mnt $ dd if=/dev/urandom of=/mnt/foo bs=4k count=1 $ sync The call trace looks like this: general protection fault, probably for non-canonical address 0xe04b37fccce3b000: 0000 [#1] PREEMPT SMP NOPTI CPU: 5 PID: 164 Comm: kworker/u20:3 Not tainted 5.15.0-rc7-custom+ #4 Workqueue: btrfs-delalloc btrfs_work_helper [btrfs] RIP: 0010:__memcpy+0x12/0x20 Call Trace: lzo_compress_pages+0x236/0x540 [btrfs] btrfs_compress_pages+0xaa/0xf0 [btrfs] compress_file_range+0x431/0x8e0 [btrfs] async_cow_start+0x12/0x30 [btrfs] btrfs_work_helper+0xf6/0x3e0 [btrfs] process_one_work+0x294/0x5d0 worker_thread+0x55/0x3c0 
kthread+0x140/0x170 ret_from_fork+0x22/0x30 ---[ end trace 63c3c0f131e61982 ]--- [CAUSE] In lzo_compress_pages(), parameter @out_pages is not only an output parameter (for the number of compressed pages), but also an input parameter, as the upper limit of compressed pages we can utilize. In commit d4088803f511 ("btrfs: subpage: make lzo_compress_pages() compatible"), the refactoring doesn't take @out_pages as an input, thus completely ignoring the limit. And for compress-force case, we could hit incompressible data that compressed size would go beyond the page limit, and cause the above crash. [FIX] Save @out_pages as @max_nr_page, and pass it to lzo_compress_pages(), and check if we're beyond the limit before accessing the pages. Note: this also fixes crash on 32bit architectures that was suspected to be caused by merge of btrfs patches to 5.16-rc1. Reported in https://lore.kernel.org/all/20211104115001.GU20319@twin.jikos.cz/ . Reported-by: Omar Sandoval Fixes: d4088803f511 ("btrfs: subpage: make lzo_compress_pages() compatible") Reviewed-by: Omar Sandoval Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 00cffc183ec0..f410ceabcdbd 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -125,6 +125,7 @@ static inline size_t read_compress_length(const char *buf) static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, struct page **out_pages, + unsigned long max_nr_page, u32 *cur_out, const u32 sectorsize) { @@ -132,6 +133,9 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 orig_out; struct page *cur_page; + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + /* * We never allow a segment header crossing sector boundary, previous * run should ensure we have enough space left inside the sector. @@ -158,6 +162,9 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); + if ((*cur_out / PAGE_SIZE) >= max_nr_page) + return -E2BIG; + cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { @@ -195,6 +202,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, struct workspace *workspace = list_entry(ws, struct workspace, list); const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; struct page *page_in = NULL; + const unsigned long max_nr_page = *out_pages; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -202,6 +210,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; + ASSERT(max_nr_page > 0); *out_pages = 0; *total_out = 0; *total_in = 0; @@ -237,7 +246,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, &cur_out, sectorsize); + pages, max_nr_page, + &cur_out, sectorsize); if (ret < 0) goto out; From 45da9c1767ac31857df572f0a909fbe88fd5a7e9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 2 Nov 2021 14:49:16 +0200 Subject: [PATCH 157/336] btrfs: fix memory ordering between normal and ordered work functions Ordered work functions aren't guaranteed to be handled by the same thread which executed the normal work functions. 
The only way execution between normal/ordered functions is synchronized is via the WORK_DONE_BIT, unfortunately the used bitops don't guarantee any ordering whatsoever. This manifested as seemingly inexplicable crashes on ARM64, where async_chunk::inode is seen as non-null in async_cow_submit which causes submit_compressed_extents to be called and crash occurs because async_chunk::inode suddenly became NULL. The call trace was similar to: pc : submit_compressed_extents+0x38/0x3d0 lr : async_cow_submit+0x50/0xd0 sp : ffff800015d4bc20 Call trace: submit_compressed_extents+0x38/0x3d0 async_cow_submit+0x50/0xd0 run_ordered_work+0xc8/0x280 btrfs_work_helper+0x98/0x250 process_one_work+0x1f0/0x4ac worker_thread+0x188/0x504 kthread+0x110/0x114 ret_from_fork+0x10/0x18 Fix this by adding respective barrier calls which ensure that all accesses preceding setting of WORK_DONE_BIT are strictly ordered before setting the flag. At the same time add a read barrier after reading of WORK_DONE_BIT in run_ordered_work which ensures all subsequent loads would be strictly ordered after reading the bit. This in turn ensures are all accesses before WORK_DONE_BIT are going to be strictly ordered before any access that can occur in ordered_func. Reported-by: Chris Murphy Fixes: 08a9ff326418 ("btrfs: Added btrfs_workqueue_struct implemented ordered execution based on kernel workqueue") CC: stable@vger.kernel.org # 4.4+ Link: https://bugzilla.redhat.com/show_bug.cgi?id=2011928 Reviewed-by: Josef Bacik Tested-by: Chris Murphy Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 309516e6a968..43c89952b7d2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -234,6 +234,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; + /* + * Orders all subsequent loads after reading WORK_DONE_BIT, + * paired with the smp_mb__before_atomic in btrfs_work_helper + * this guarantees that the ordered function will see all + * updates from ordinary work function. + */ + smp_rmb(); /* * we are going to call the ordered done function, but @@ -317,6 +324,13 @@ static void btrfs_work_helper(struct work_struct *normal_work) thresh_exec_hook(wq); work->func(work); if (need_order) { + /* + * Ensures all memory accesses done in the work function are + * ordered before setting the WORK_DONE_BIT. Ensuring the thread + * which is going to executed the ordered work sees them. + * Pairs with the smp_rmb in run_ordered_work. + */ + smp_mb__before_atomic(); set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq, work); } else { From 4d9380e0da7be2351437cdac71673a9cd94e50fd Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 Nov 2021 12:43:08 +0000 Subject: [PATCH 158/336] btrfs: silence lockdep when reading chunk tree during mount Often some test cases like btrfs/161 trigger lockdep splats that complain about possible unsafe lock scenario due to the fact that during mount, when reading the chunk tree we end up calling blkdev_get_by_path() while holding a read lock on a leaf of the chunk tree. 
That produces a lockdep splat like the following: [ 3653.683975] ====================================================== [ 3653.685148] WARNING: possible circular locking dependency detected [ 3653.686301] 5.15.0-rc7-btrfs-next-103 #1 Not tainted [ 3653.687239] ------------------------------------------------------ [ 3653.688400] mount/447465 is trying to acquire lock: [ 3653.689320] ffff8c6b0c76e528 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.691054] but task is already holding lock: [ 3653.692155] ffff8c6b0a9f39e0 (btrfs-chunk-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 3653.693978] which lock already depends on the new lock. [ 3653.695510] the existing dependency chain (in reverse order) is: [ 3653.696915] -> #3 (btrfs-chunk-00){++++}-{3:3}: [ 3653.698053] down_read_nested+0x4b/0x140 [ 3653.698893] __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 3653.699988] btrfs_read_lock_root_node+0x31/0x40 [btrfs] [ 3653.701205] btrfs_search_slot+0x537/0xc00 [btrfs] [ 3653.702234] btrfs_insert_empty_items+0x32/0x70 [btrfs] [ 3653.703332] btrfs_init_new_device+0x563/0x15b0 [btrfs] [ 3653.704439] btrfs_ioctl+0x2110/0x3530 [btrfs] [ 3653.705405] __x64_sys_ioctl+0x83/0xb0 [ 3653.706215] do_syscall_64+0x3b/0xc0 [ 3653.706990] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3653.708040] -> #2 (sb_internal#2){.+.+}-{0:0}: [ 3653.708994] lock_release+0x13d/0x4a0 [ 3653.709533] up_write+0x18/0x160 [ 3653.710017] btrfs_sync_file+0x3f3/0x5b0 [btrfs] [ 3653.710699] __loop_update_dio+0xbd/0x170 [loop] [ 3653.711360] lo_ioctl+0x3b1/0x8a0 [loop] [ 3653.711929] block_ioctl+0x48/0x50 [ 3653.712442] __x64_sys_ioctl+0x83/0xb0 [ 3653.712991] do_syscall_64+0x3b/0xc0 [ 3653.713519] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3653.714233] -> #1 (&lo->lo_mutex){+.+.}-{3:3}: [ 3653.715026] __mutex_lock+0x92/0x900 [ 3653.715648] lo_open+0x28/0x60 [loop] [ 3653.716275] blkdev_get_whole+0x28/0x90 [ 3653.716867] blkdev_get_by_dev.part.0+0x142/0x320 [ 3653.717537] blkdev_open+0x5e/0xa0 [ 3653.718043] do_dentry_open+0x163/0x390 [ 3653.718604] path_openat+0x3f0/0xa80 [ 3653.719128] do_filp_open+0xa9/0x150 [ 3653.719652] do_sys_openat2+0x97/0x160 [ 3653.720197] __x64_sys_openat+0x54/0x90 [ 3653.720766] do_syscall_64+0x3b/0xc0 [ 3653.721285] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3653.721986] -> #0 (&disk->open_mutex){+.+.}-{3:3}: [ 3653.722775] __lock_acquire+0x130e/0x2210 [ 3653.723348] lock_acquire+0xd7/0x310 [ 3653.723867] __mutex_lock+0x92/0x900 [ 3653.724394] blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.725041] blkdev_get_by_path+0xb8/0xd0 [ 3653.725614] btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] [ 3653.726332] open_fs_devices+0xd7/0x2c0 [btrfs] [ 3653.726999] btrfs_read_chunk_tree+0x3ad/0x870 [btrfs] [ 3653.727739] open_ctree+0xb8e/0x17bf [btrfs] [ 3653.728384] btrfs_mount_root.cold+0x12/0xde [btrfs] [ 3653.729130] legacy_get_tree+0x30/0x50 [ 3653.729676] vfs_get_tree+0x28/0xc0 [ 3653.730192] vfs_kern_mount.part.0+0x71/0xb0 [ 3653.730800] btrfs_mount+0x11d/0x3a0 [btrfs] [ 3653.731427] legacy_get_tree+0x30/0x50 [ 3653.731970] vfs_get_tree+0x28/0xc0 [ 3653.732486] path_mount+0x2d4/0xbe0 [ 3653.732997] __x64_sys_mount+0x103/0x140 [ 3653.733560] do_syscall_64+0x3b/0xc0 [ 3653.734080] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3653.734782] other info that might help us debug this: [ 3653.735784] Chain exists of: &disk->open_mutex --> sb_internal#2 --> btrfs-chunk-00 [ 3653.737123] Possible unsafe locking scenario: [ 3653.737865] CPU0 CPU1 [ 3653.738435] ---- ---- [ 3653.739007] 
lock(btrfs-chunk-00); [ 3653.739449] lock(sb_internal#2); [ 3653.740193] lock(btrfs-chunk-00); [ 3653.740955] lock(&disk->open_mutex); [ 3653.741431] *** DEADLOCK *** [ 3653.742176] 3 locks held by mount/447465: [ 3653.742739] #0: ffff8c6acf85c0e8 (&type->s_umount_key#44/1){+.+.}-{3:3}, at: alloc_super+0xd5/0x3b0 [ 3653.744114] #1: ffffffffc0b28f70 (uuid_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x59/0x870 [btrfs] [ 3653.745563] #2: ffff8c6b0a9f39e0 (btrfs-chunk-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs] [ 3653.747066] stack backtrace: [ 3653.747723] CPU: 4 PID: 447465 Comm: mount Not tainted 5.15.0-rc7-btrfs-next-103 #1 [ 3653.748873] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 3653.750592] Call Trace: [ 3653.750967] dump_stack_lvl+0x57/0x72 [ 3653.751526] check_noncircular+0xf3/0x110 [ 3653.752136] ? stack_trace_save+0x4b/0x70 [ 3653.752748] __lock_acquire+0x130e/0x2210 [ 3653.753356] lock_acquire+0xd7/0x310 [ 3653.753898] ? blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.754596] ? lock_is_held_type+0xe8/0x140 [ 3653.755125] ? blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.755729] ? blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.756338] __mutex_lock+0x92/0x900 [ 3653.756794] ? blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.757400] ? do_raw_spin_unlock+0x4b/0xa0 [ 3653.757930] ? _raw_spin_unlock+0x29/0x40 [ 3653.758437] ? bd_prepare_to_claim+0x129/0x150 [ 3653.758999] ? trace_module_get+0x2b/0xd0 [ 3653.759508] ? try_module_get.part.0+0x50/0x80 [ 3653.760072] blkdev_get_by_dev.part.0+0xe7/0x320 [ 3653.760661] ? devcgroup_check_permission+0xc1/0x1f0 [ 3653.761288] blkdev_get_by_path+0xb8/0xd0 [ 3653.761797] btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] [ 3653.762454] open_fs_devices+0xd7/0x2c0 [btrfs] [ 3653.763055] ? clone_fs_devices+0x8f/0x170 [btrfs] [ 3653.763689] btrfs_read_chunk_tree+0x3ad/0x870 [btrfs] [ 3653.764370] ? kvm_sched_clock_read+0x14/0x40 [ 3653.764922] open_ctree+0xb8e/0x17bf [btrfs] [ 3653.765493] ? super_setup_bdi_name+0x79/0xd0 [ 3653.766043] btrfs_mount_root.cold+0x12/0xde [btrfs] [ 3653.766780] ? rcu_read_lock_sched_held+0x3f/0x80 [ 3653.767488] ? kfree+0x1f2/0x3c0 [ 3653.767979] legacy_get_tree+0x30/0x50 [ 3653.768548] vfs_get_tree+0x28/0xc0 [ 3653.769076] vfs_kern_mount.part.0+0x71/0xb0 [ 3653.769718] btrfs_mount+0x11d/0x3a0 [btrfs] [ 3653.770381] ? rcu_read_lock_sched_held+0x3f/0x80 [ 3653.771086] ? kfree+0x1f2/0x3c0 [ 3653.771574] legacy_get_tree+0x30/0x50 [ 3653.772136] vfs_get_tree+0x28/0xc0 [ 3653.772673] path_mount+0x2d4/0xbe0 [ 3653.773201] __x64_sys_mount+0x103/0x140 [ 3653.773793] do_syscall_64+0x3b/0xc0 [ 3653.774333] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 3653.775094] RIP: 0033:0x7f648bc45aaa This happens because through btrfs_read_chunk_tree(), which is called only during mount, ends up acquiring the mutex open_mutex of a block device while holding a read lock on a leaf of the chunk tree while other paths need to acquire other locks before locking extent buffers of the chunk tree. Since at mount time when we call btrfs_read_chunk_tree() we know that we don't have other tasks running in parallel and modifying the chunk tree, we can simply skip locking of chunk tree extent buffers. So do that and move the assertion that checks the fs is not yet mounted to the top block of btrfs_read_chunk_tree(), with a comment before doing it. 
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 546bf1146b2d..cc80f2a97a0b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7558,6 +7558,19 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) */ fs_info->fs_devices->total_rw_bytes = 0; + /* + * Lockdep complains about possible circular locking dependency between + * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores + * used for freeze procection of a fs (struct super_block.s_writers), + * which we take when starting a transaction, and extent buffers of the + * chunk tree if we call read_one_dev() while holding a lock on an + * extent buffer of the chunk tree. Since we are mounting the filesystem + * and at this point there can't be any concurrent task modifying the + * chunk tree, to keep it simple, just skip locking on the chunk tree. + */ + ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); + path->skip_locking = 1; + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id @@ -7583,10 +7596,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } - /* - * The nodes on level 1 are not locked but we don't need to do - * that during mount time as nothing else can access the tree - */ node = path->nodes[1]; if (node) { if (last_ra_node != node->start) { @@ -7614,7 +7623,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * requirement for chunk allocation, see the comment on * top of btrfs_chunk_alloc() for details. */ - ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); ret = read_one_chunk(&found_key, leaf, chunk); if (ret) From a91cf0ffbc244792e0b3ecf7d0fddb2f344b461f Mon Sep 17 00:00:00 2001 From: Wang Yugui Date: Thu, 28 Oct 2021 06:32:54 +0800 Subject: [PATCH 159/336] btrfs: check-integrity: fix a warning on write caching disabled disk When a disk has write caching disabled, we skip submission of a bio with flush and sync requests before writing the superblock, since it's not needed. However when the integrity checker is enabled, this results in reports that there are metadata blocks referred by a superblock that were not properly flushed. So don't skip the bio submission only when the integrity checker is enabled for the sake of simplicity, since this is a debug tool and not meant for use in non-debug builds. fstests/btrfs/220 trigger a check-integrity warning like the following when CONFIG_BTRFS_FS_CHECK_INTEGRITY=y and the disk with WCE=0. btrfs: attempt to write superblock which references block M @5242880 (sdb2/5242880/0) which is not flushed out of disk's write cache (block flush_gen=1, dev->flush_gen=0)! ------------[ cut here ]------------ WARNING: CPU: 28 PID: 843680 at fs/btrfs/check-integrity.c:2196 btrfsic_process_written_superblock+0x22a/0x2a0 [btrfs] CPU: 28 PID: 843680 Comm: umount Not tainted 5.15.0-0.rc5.39.el8.x86_64 #1 Hardware name: Dell Inc. 
Precision T7610/0NK70N, BIOS A18 09/11/2019 RIP: 0010:btrfsic_process_written_superblock+0x22a/0x2a0 [btrfs] RSP: 0018:ffffb642afb47940 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 RDX: 00000000ffffffff RSI: ffff8b722fc97d00 RDI: ffff8b722fc97d00 RBP: ffff8b5601c00000 R08: 0000000000000000 R09: c0000000ffff7fff R10: 0000000000000001 R11: ffffb642afb476f8 R12: ffffffffffffffff R13: ffffb642afb47974 R14: ffff8b5499254c00 R15: 0000000000000003 FS: 00007f00a06d4080(0000) GS:ffff8b722fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff5cff5ff0 CR3: 00000001c0c2a006 CR4: 00000000001706e0 Call Trace: btrfsic_process_written_block+0x2f7/0x850 [btrfs] __btrfsic_submit_bio.part.19+0x310/0x330 [btrfs] ? bio_associate_blkg_from_css+0xa4/0x2c0 btrfsic_submit_bio+0x18/0x30 [btrfs] write_dev_supers+0x81/0x2a0 [btrfs] ? find_get_pages_range_tag+0x219/0x280 ? pagevec_lookup_range_tag+0x24/0x30 ? __filemap_fdatawait_range+0x6d/0xf0 ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e ? find_first_extent_bit+0x9b/0x160 [btrfs] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e write_all_supers+0x1b3/0xa70 [btrfs] ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e btrfs_commit_transaction+0x59d/0xac0 [btrfs] close_ctree+0x11d/0x339 [btrfs] generic_shutdown_super+0x71/0x110 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 exit_to_user_mode_prepare+0x1f0/0x200 syscall_exit_to_user_mode+0x12/0x30 do_syscall_64+0x46/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f009f711dfb RSP: 002b:00007fff5cff7928 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 000055b68c6c9970 RCX: 00007f009f711dfb RDX: 0000000000000001 RSI: 0000000000000000 RDI: 000055b68c6c9b50 RBP: 0000000000000000 R08: 000055b68c6ca900 R09: 00007f009f795580 R10: 0000000000000000 R11: 0000000000000246 R12: 000055b68c6c9b50 R13: 00007f00a04bf184 R14: 0000000000000000 R15: 00000000ffffffff ---[ end trace 2c4b82abcef9eec4 ]--- S-65536(sdb2/65536/1) --> M-1064960(sdb2/1064960/1) Reviewed-by: Filipe Manana Signed-off-by: Wang Yugui Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c7254331cf38..847aabb30676 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3978,11 +3978,23 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio = device->flush_bio; +#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio + * with flush and sync requests before writing the superblock, since + * it's not needed. However when the integrity checker is enabled, this + * results in reports that there are metadata blocks referred by a + * superblock that were not properly flushed. So don't skip the bio + * submission only when the integrity checker is enabled for the sake + * of simplicity, since this is a debug tool and not meant for use in + * non-debug builds. 
+ */ + struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; +#endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; From d08e38b62327961295be1c63b562cd46ec97cd07 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 10 Nov 2021 19:20:08 +0000 Subject: [PATCH 160/336] btrfs: make 1-bit bit-fields of scrub_page unsigned int The bitfields have_csum and io_error are currently signed which is not recommended as the representation is an implementation defined behaviour. Fix this by making the bit-fields unsigned ints. Fixes: 2c36395430b0 ("btrfs: scrub: remove the anonymous structure from scrub_page") Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cf82ea6f54fb..8f6ceea33969 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -73,8 +73,8 @@ struct scrub_page { u64 physical_for_dev_replace; atomic_t refs; u8 mirror_num; - int have_csum:1; - int io_error:1; + unsigned int have_csum:1; + unsigned int io_error:1; u8 csum[BTRFS_CSUM_SIZE]; struct scrub_recover *recover; From 6c405b24097c24cbb11570b47fd382676014f72e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 10 Nov 2021 13:41:04 +0200 Subject: [PATCH 161/336] btrfs: deprecate BTRFS_IOC_BALANCE ioctl The v2 balance ioctl has been introduced more than 9 years ago. Users of the old v1 ioctl should have long been migrated to it. It's time we deprecate it and eventually remove it. The only known user is in btrfs-progs that tries v1 as a fallback in case v2 is not supported. This is not necessary anymore. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 92424a22d8d6..012fbfdfbebf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3986,6 +3986,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bool need_unlock; /* for mut. excl. ops lock */ int ret; + if (!arg) + btrfs_warn(fs_info, + "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); + if (!capable(CAP_SYS_ADMIN)) return -EPERM; From 0226487ad8146433be74d20955448e12d85fd251 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 15 Nov 2021 21:00:08 -0600 Subject: [PATCH 162/336] cifs: move debug print out of spinlock It is better to print debug messages outside of the chan_lock spinlock where possible. 
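The pattern is to decide what needs to be reported while the lock is held, and to do the reporting itself only after the lock has been dropped, so the lock is never held across comparatively slow console/log I/O. A rough userspace sketch with invented structure and field names, using a pthread spinlock in place of the kernel spinlock (this is not the cifs code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct session {
        pthread_spinlock_t lock;
        bool multichannel;
        int chan_max;
        const char *hostname;
};

static int setup_channels(struct session *ses)
{
        bool unsupported;

        pthread_spin_lock(&ses->lock);
        unsupported = !ses->multichannel;
        if (unsupported)
                ses->chan_max = 1;
        pthread_spin_unlock(&ses->lock);

        /* Log only after unlocking. */
        if (unsupported) {
                fprintf(stderr, "server %s does not support multichannel\n",
                        ses->hostname);
                return 0;
        }
        return 1;
}

int main(void)
{
        struct session ses = { .multichannel = false, .hostname = "srv1" };

        pthread_spin_init(&ses.lock, PTHREAD_PROCESS_PRIVATE);
        setup_channels(&ses);
        pthread_spin_destroy(&ses.lock);
        return 0;
}

That is what the one-line move in the diff below does for cifs_try_adding_channels().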
Reviewed-by: Shyam Prasad N Addresses-Coverity: 1493854 ("Thread deadlock") Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 2c10b186ed6e..7db8b22edac9 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -95,9 +95,9 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) } if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); ses->chan_max = 1; spin_unlock(&ses->chan_lock); + cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); return 0; } spin_unlock(&ses->chan_lock); From 446e21482e8c10b96d540862c5f6a37009436f00 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 15 Nov 2021 18:02:27 -0600 Subject: [PATCH 163/336] cifs: protect srv_count with cifs_tcp_ses_lock Updates to the srv_count field are protected elsewhere with the cifs_tcp_ses_lock spinlock. Add one missing place (cifs_get_tcp_sesion). CC: Shyam Prasad N Addresses-Coverity: 1494149 ("Data Race Condition") Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 82577a7a5bb1..b98711b62897 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1452,8 +1452,10 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->max_in_flight = 0; tcp_ses->credits = 1; if (primary_server) { + spin_lock(&cifs_tcp_ses_lock); ++primary_server->srv_count; tcp_ses->primary_server = primary_server; + spin_unlock(&cifs_tcp_ses_lock); } init_waitqueue_head(&tcp_ses->response_q); init_waitqueue_head(&tcp_ses->request_q); From 8ae87bbeb5d1bfd4ddf2f73f72be51d02d6be2eb Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 16 Nov 2021 13:38:58 -0300 Subject: [PATCH 164/336] cifs: introduce cifs_ses_mark_for_reconnect() helper Use new cifs_ses_mark_for_reconnect() helper to mark all session channels for reconnect instead of duplicating it in different places. 
Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifs_swn.c | 16 ++-------------- fs/cifs/cifsproto.h | 1 + fs/cifs/connect.c | 14 +------------- fs/cifs/dfs_cache.c | 7 +------ fs/cifs/sess.c | 13 +++++++++++++ 5 files changed, 18 insertions(+), 33 deletions(-) diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 12bde7bfda86..23a1ed2fb769 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -393,26 +393,14 @@ static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg) static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state) { - int i; - switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - for (i = 0; i < swnreg->tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting) - swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(swnreg->tcon->ses); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - for (i = 0; i < swnreg->tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (swnreg->tcon->ses->chans[i].server->tcpStatus != CifsExiting) - swnreg->tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(swnreg->tcon->ses); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f3073a62ce57..4f5a3e857df4 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -599,6 +599,7 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) bool is_server_using_iface(struct TCP_Server_Info *server, struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); +void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b98711b62897..67e4c5548e9d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4113,18 +4113,6 @@ cifs_prune_tlinks(struct work_struct *work) } #ifdef CONFIG_CIFS_DFS_UPCALL -static void mark_tcon_tcp_ses_for_reconnect(struct cifs_tcon *tcon) -{ - int i; - - for (i = 0; i < tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (tcon->ses->chans[i].server->tcpStatus != CifsExiting) - tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } -} - /* Update dfs referral path of superblock */ static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb, const char *target) @@ -4301,7 +4289,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - mark_tcon_tcp_ses_for_reconnect(tcon); + cifs_ses_mark_for_reconnect(tcon->ses); } dfs_cache_free_tgts(tl); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 5c1259d2eeac..e9b0fa2a9614 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,12 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } 
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - for (i = 0; i < tcon->ses->chan_count; i++) { - spin_lock(&GlobalMid_Lock); - if (tcon->ses->chans[i].server->tcpStatus != CifsExiting) - tcon->ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&GlobalMid_Lock); - } + cifs_ses_mark_for_reconnect(tcon->ses); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 7db8b22edac9..8ad2993785af 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -318,6 +318,19 @@ out: return rc; } +/* Mark all session channels for reconnect */ +void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) +{ + int i; + + for (i = 0; i < ses->chan_count; i++) { + spin_lock(&GlobalMid_Lock); + if (ses->chans[i].server->tcpStatus != CifsExiting) + ses->chans[i].server->tcpStatus = CifsNeedReconnect; + spin_unlock(&GlobalMid_Lock); + } +} + static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; From 83dde7498fefeb920b1def317421262317d178e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 7 Nov 2021 08:40:47 +0200 Subject: [PATCH 165/336] RDMA/netlink: Add __maybe_unused to static inline in C file Like other commits in the tree add __maybe_unused to a static inline in a C file because some clang compilers will complain about unused code: >> drivers/infiniband/core/nldev.c:2543:1: warning: unused function '__chk_RDMA_NL_NLDEV' MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); ^ Fixes: e3bf14bdc17a ("rdma: Autoload netlink client modules") Link: https://lore.kernel.org/r/4a8101919b765e01d7fde6f27fd572c958deeb4a.1636267207.git.leonro@nvidia.com Reported-by: kernel test robot Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 2758d9df71ee..c2a79aeee113 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -30,7 +30,7 @@ enum rdma_nl_flags { * constant as well and the compiler checks they are the same. */ #define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ - static inline void __chk_##_index(void) \ + static inline void __maybe_unused __chk_##_index(void) \ { \ BUILD_BUG_ON(_index != _val); \ } \ From 6cd7397d01c4a3e09757840299e4f114f0aa5fa0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Nov 2021 13:45:00 +0200 Subject: [PATCH 166/336] RDMA/core: Set send and receive CQ before forwarding to the driver Preset both receive and send CQ pointers prior to call to the drivers and overwrite it later again till the mlx4 is going to be changed do not overwrite ibqp properties. This change is needed for mlx5, because in case of QP creation failure, it will go to the path of QP destroy which relies on proper CQ pointers. 
BUG: KASAN: use-after-free in create_qp.cold+0x164/0x16e [mlx5_ib] Write of size 8 at addr ffff8880064c55c0 by task a.out/246 CPU: 0 PID: 246 Comm: a.out Not tainted 5.15.0+ #291 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x140 kasan_report.cold+0x83/0xdf create_qp.cold+0x164/0x16e [mlx5_ib] mlx5_ib_create_qp+0x358/0x28a0 [mlx5_ib] create_qp.part.0+0x45b/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 246: kasan_save_stack+0x1b/0x40 __kasan_kmalloc+0xa4/0xd0 create_qp.part.0+0x92/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 246: kasan_save_stack+0x1b/0x40 kasan_set_track+0x1c/0x30 kasan_set_free_info+0x20/0x30 __kasan_slab_free+0x10c/0x150 slab_free_freelist_hook+0xb4/0x1b0 kfree+0xe7/0x2a0 create_qp.part.0+0x52b/0x6a0 [ib_core] ib_create_qp_user+0x97/0x150 [ib_core] ib_uverbs_handler_UVERBS_METHOD_QP_CREATE+0x92c/0x1250 [ib_uverbs] ib_uverbs_cmd_verbs+0x1c38/0x3150 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x866/0x14d0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 514aee660df4 ("RDMA: Globally allocate and release QP memory") Link: https://lore.kernel.org/r/2dbb2e2cbb1efb188a500e5634be1d71956424ce.1636631035.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 692d5ff657df..c18634bec212 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1232,6 +1232,9 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); rdma_restrack_set_name(&qp->res, udata ? NULL : caller); From da86dc175b5af1f3e95642cdb536bfa4f7ddb1a9 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Mon, 15 Nov 2021 15:09:13 -0500 Subject: [PATCH 167/336] IB/hfi1: Properly allocate rdma counter desc memory When optional counter support was added the allocation of the memory holding the counter descriptors was not cleared properly. This caused WARN_ON()s in the IB/sysfs code to be hit. This is because the uninitialized memory made some of the counters wrongly look like optional counters. Use kzalloc. While here change the sizeof() calls to use the pointer rather than the name of the type. 
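Both changes are easiest to see side by side in a userspace sketch: calloc() plays the role of kzalloc(), so the descriptor fields start out zeroed instead of containing stale heap data, and the allocation size is written as sizeof(*ptr) so it stays correct if the pointer's type ever changes. All names below are invented for the example; this is not the hfi1 code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct stat_desc {
        const char *name;
        unsigned int flags;     /* must start out 0, not leftover heap data */
};

static struct stat_desc *alloc_counter_descs(const char *names,
                                             size_t names_len, int ncounters)
{
        struct stat_desc *descs;
        char *strings;

        /* Zeroing allocation: no descriptor is left with garbage flags. */
        descs = calloc(1, ncounters * sizeof(*descs) + names_len);
        if (!descs)
                return NULL;

        /* The string table lives right after the descriptor array. */
        strings = (char *)descs + ncounters * sizeof(*descs);
        memcpy(strings, names, names_len);
        descs[0].name = strings;
        return descs;
}

int main(void)
{
        static const char names[] = "rx_pkts";
        struct stat_desc *descs = alloc_counter_descs(names, sizeof(names), 4);

        if (descs) {
                printf("%s flags=%u\n", descs[0].name, descs[0].flags);
                free(descs);
        }
        return 0;
}

In the driver it was precisely the uninitialized descriptor memory that made ordinary counters look like optional ones and tripped the WARN_ON() below.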
WARNING: CPU: 0 PID: 32644 at drivers/infiniband/core/sysfs.c:1064 ib_setup_port_attrs+0x7e1/0x890 [ib_core] CPU: 0 PID: 32644 Comm: kworker/0:2 Tainted: G S W 5.15.0+ #36 Hardware name: Intel Corporation S2600WTT/S2600WTT, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016 Workqueue: events work_for_cpu_fn RIP: 0010:ib_setup_port_attrs+0x7e1/0x890 [ib_core] RSP: 0018:ffffc90006ea3c40 EFLAGS: 00010202 RAX: 0000000000000068 RBX: ffff888106ad8000 RCX: 0000000000000138 RDX: ffff888126c84c00 RSI: ffff888103c41000 RDI: 0000000000000124 RBP: ffff88810f63a801 R08: ffff888126c8a000 R09: 0000000000000001 R10: ffffffffa09acf20 R11: 0000000000000065 R12: ffff88810f63a800 R13: ffff88810f63a800 R14: ffff88810f63a8e0 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff888667a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005590102cb078 CR3: 000000000240a003 CR4: 00000000001706f0 Call Trace: ib_register_device.cold.44+0x23e/0x2d0 [ib_core] rvt_register_device+0xfa/0x230 [rdmavt] hfi1_register_ib_device+0x623/0x690 [hfi1] init_one.cold.36+0x2d1/0x49b [hfi1] local_pci_probe+0x45/0x80 work_for_cpu_fn+0x16/0x20 process_one_work+0x1b1/0x360 worker_thread+0x1d4/0x3a0 kthread+0x11a/0x140 ret_from_fork+0x22/0x30 Fixes: 5e2ddd1e5982 ("RDMA/counter: Add optional counter support") Link: https://lore.kernel.org/r/20211115200913.124104.47770.stgit@awfm-01.cornelisnetworks.com Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ed9fa0d84e9e..dc9211f3a009 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1628,8 +1628,7 @@ static int init_cntr_names(const char *names_in, const size_t names_len, n++; names_out = - kmalloc((n + num_extra_names) * sizeof(struct rdma_stat_desc) + - names_len, + kzalloc((n + num_extra_names) * sizeof(*q) + names_len, GFP_KERNEL); if (!names_out) { *num_cntrs = 0; @@ -1637,7 +1636,7 @@ static int init_cntr_names(const char *names_in, const size_t names_len, return -ENOMEM; } - p = names_out + (n + num_extra_names) * sizeof(struct rdma_stat_desc); + p = names_out + (n + num_extra_names) * sizeof(*q); memcpy(p, names_in, names_len); q = (struct rdma_stat_desc *)names_out; From 994a04a20b03128838ec0250a0e266aab24d23f1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Nov 2021 12:13:41 +0100 Subject: [PATCH 168/336] thermal: int340x: Limit Kconfig to 64-bit 32-bit processors cannot generally access 64-bit MMIO registers atomically, and it is unknown in which order the two halves of this registers would need to be read: drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c: In function 'send_mbox_cmd': drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c:79:37: error: implicit declaration of function 'readq'; did you mean 'readl'? [-Werror=implicit-function-declaration] 79 | *cmd_resp = readq((void __iomem *) (proc_priv->mmio_base + MBOX_OFFSET_DATA)); | ^~~~~ | readl The driver already does not build for anything other than x86, so limit it further to x86-64. Fixes: aeb58c860dc5 ("thermal/drivers/int340x: processor_thermal: Suppot 64 bit RFIM responses") Signed-off-by: Arnd Bergmann Reviewed-by: Srinivas Pandruvada Reviewed-by: Randy Dunlap Signed-off-by: Rafael J. 
Wysocki --- drivers/thermal/intel/int340x_thermal/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/Kconfig b/drivers/thermal/intel/int340x_thermal/Kconfig index 45c31f3d6054..5d046de96a5d 100644 --- a/drivers/thermal/intel/int340x_thermal/Kconfig +++ b/drivers/thermal/intel/int340x_thermal/Kconfig @@ -5,12 +5,12 @@ config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" - depends on X86 && ACPI && PCI + depends on X86_64 && ACPI && PCI select THERMAL_GOV_USER_SPACE select ACPI_THERMAL_REL select ACPI_FAN select INTEL_SOC_DTS_IOSF_CORE - select PROC_THERMAL_MMIO_RAPL if X86_64 && POWERCAP + select PROC_THERMAL_MMIO_RAPL if POWERCAP help Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core From ac5d272a0ad0419f52e08c91953356e32b075af7 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 15 Nov 2021 11:29:04 -0800 Subject: [PATCH 169/336] x86/sgx: Fix free page accounting The SGX driver maintains a single global free page counter, sgx_nr_free_pages, that reflects the number of free pages available across all NUMA nodes. Correspondingly, a list of free pages is associated with each NUMA node and sgx_nr_free_pages is updated every time a page is added or removed from any of the free page lists. The main usage of sgx_nr_free_pages is by the reclaimer that runs when it (sgx_nr_free_pages) goes below a watermark to ensure that there are always some free pages available to, for example, support efficient page faults. With sgx_nr_free_pages accessed and modified from a few places it is essential to ensure that these accesses are done safely but this is not the case. sgx_nr_free_pages is read without any protection and updated with inconsistent protection by any one of the spin locks associated with the individual NUMA nodes. For example: CPU_A CPU_B ----- ----- spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); ... ... sgx_nr_free_pages--; /* NOT SAFE */ sgx_nr_free_pages--; spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); Since sgx_nr_free_pages may be protected by different spin locks while being modified from different CPUs, the following scenario is possible: CPU_A CPU_B ----- ----- {sgx_nr_free_pages = 100} spin_lock(&nodeA->lock); spin_lock(&nodeB->lock); sgx_nr_free_pages--; sgx_nr_free_pages--; /* LOAD sgx_nr_free_pages = 100 */ /* LOAD sgx_nr_free_pages = 100 */ /* sgx_nr_free_pages-- */ /* sgx_nr_free_pages-- */ /* STORE sgx_nr_free_pages = 99 */ /* STORE sgx_nr_free_pages = 99 */ spin_unlock(&nodeA->lock); spin_unlock(&nodeB->lock); In the above scenario, sgx_nr_free_pages is decremented from two CPUs but instead of sgx_nr_free_pages ending with a value that is two less than it started with, it was only decremented by one while the number of free pages were actually reduced by two. The consequence of sgx_nr_free_pages not being protected is that its value may not accurately reflect the actual number of free pages on the system, impacting the availability of free pages in support of many flows. The problematic scenario is when the reclaimer does not run because it believes there to be sufficient free pages while any attempt to allocate a page fails because there are no free pages available. 
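The lost-update problem is easy to reproduce outside the kernel. In the sketch below, two threads update one shared counter, each under its own lock, so the locks give no mutual exclusion for the counter itself: the plain counter routinely comes up short, while the counter built from a C11 atomic (standing in for the kernel's atomic_long_t) does not. This is a standalone pthread program with invented names, not the SGX driver; build with cc -pthread.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define ITERS 1000000

static pthread_mutex_t node_a_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t node_b_lock = PTHREAD_MUTEX_INITIALIZER;

static long plain_count;                /* racy: updates can be lost */
static atomic_long atomic_count;        /* updates are never lost */

static void *worker(void *arg)
{
        pthread_mutex_t *lock = arg;    /* each thread gets a different lock */

        for (int i = 0; i < ITERS; i++) {
                pthread_mutex_lock(lock);
                plain_count++;
                atomic_fetch_add(&atomic_count, 1);
                pthread_mutex_unlock(lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, &node_a_lock);
        pthread_create(&b, NULL, worker, &node_b_lock);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* The plain counter typically ends up below 2 * ITERS. */
        printf("plain: %ld, atomic: %ld, expected: %d\n",
               plain_count, atomic_load(&atomic_count), 2 * ITERS);
        return 0;
}

The same lost updates are what leave sgx_nr_free_pages out of step with the actual free lists.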
In the SGX driver the reclaimer's watermark is only 32 pages so after encountering the above example scenario 32 times a user space hang is possible when there are no more free pages because of repeated page faults caused by no free pages made available. The following flow was encountered: asm_exc_page_fault ... sgx_vma_fault() sgx_encl_load_page() sgx_encl_eldu() // Encrypted page needs to be loaded from backing // storage into newly allocated SGX memory page sgx_alloc_epc_page() // Allocate a page of SGX memory __sgx_alloc_epc_page() // Fails, no free SGX memory ... if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) // Wake reclaimer wake_up(&ksgxd_waitq); return -EBUSY; // Return -EBUSY giving reclaimer time to run return -EBUSY; return -EBUSY; return VM_FAULT_NOPAGE; The reclaimer is triggered in above flow with the following code: static bool sgx_should_reclaim(unsigned long watermark) { return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } In the problematic scenario there were no free pages available yet the value of sgx_nr_free_pages was above the watermark. The allocation of SGX memory thus always failed because of a lack of free pages while no free pages were made available because the reclaimer is never started because of sgx_nr_free_pages' incorrect value. The consequence was that user space kept encountering VM_FAULT_NOPAGE that caused the same address to be accessed repeatedly with the same result. Change the global free page counter to an atomic type that ensures simultaneous updates are done safely. While doing so, move the updating of the variable outside of the spin lock critical section to which it does not belong. Cc: stable@vger.kernel.org Fixes: 901ddbb9ecf5 ("x86/sgx: Add a basic NUMA allocation scheme to sgx_alloc_epc_page()") Suggested-by: Dave Hansen Signed-off-by: Reinette Chatre Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Acked-by: Jarkko Sakkinen Link: https://lkml.kernel.org/r/a95a40743bbd3f795b465f30922dde7f1ea9e0eb.1637004094.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/sgx/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 63d3de02bbcc..8471a8b9b48e 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. */ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. 
*/ static nodemask_t sgx_numa_mask; @@ -403,14 +402,15 @@ skip: spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, From 99b63316c39988039965693f5f43d8b4ccb1c86c Mon Sep 17 00:00:00 2001 From: Manaf Meethalavalappu Pallikunhi Date: Wed, 3 Nov 2021 01:30:40 +0530 Subject: [PATCH 170/336] thermal: core: Reset previous low and high trip during thermal zone init During the suspend is in process, thermal_zone_device_update bails out thermal zone re-evaluation for any sensor trip violation without setting next valid trip to that sensor. It assumes during resume it will re-evaluate same thermal zone and update trip. But when it is in suspend temperature goes down and on resume path while updating thermal zone if temperature is less than previously violated trip, thermal zone set trip function evaluates the same previous high and previous low trip as new high and low trip. Since there is no change in high/low trip, it bails out from thermal zone set trip API without setting any trip. It leads to a case where sensor high trip or low trip is disabled forever even though thermal zone has a valid high or low trip. During thermal zone device init, reset thermal zone previous high and low trip. It resolves above mentioned scenario. Signed-off-by: Manaf Meethalavalappu Pallikunhi Reviewed-by: Thara Gopinath Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 648829ab79ff..82654dc8382b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -421,6 +421,8 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_instance *pos; tz->temperature = THERMAL_TEMP_INVALID; + tz->prev_low_trip = -INT_MAX; + tz->prev_high_trip = INT_MAX; list_for_each_entry(pos, &tz->thermal_instances, tz_node) pos->initialized = false; } From cc4a9cc03faa6d8db1a6954bb536f2c1e63bdff6 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 15 Sep 2021 13:25:31 +0300 Subject: [PATCH 171/336] net/mlx5e: kTLS, Fix crash in RX resync flow For the TLS RX resync flow, we maintain a list of TLS contexts that require some attention, to communicate their resync information to the HW. Here we fix list corruptions, by protecting the entries against movements coming from resync_handle_seq_match(), until their resync handling in napi is fully completed. 
Fixes: e9ce991bce5b ("net/mlx5e: kTLS, Add resiliency to RX resync failures") Signed-off-by: Tariq Toukan Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/en_accel/ktls_rx.c | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 62abce008c7b..a2a9f68579dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -55,6 +55,7 @@ struct mlx5e_ktls_offload_context_rx { DECLARE_BITMAP(flags, MLX5E_NUM_PRIV_RX_FLAGS); /* resync */ + spinlock_t lock; /* protects resync fields */ struct mlx5e_ktls_rx_resync_ctx resync; struct list_head list; }; @@ -386,14 +387,18 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r struct mlx5e_icosq *sq; bool trigger_poll; - memcpy(info->rec_seq, &priv_rx->resync.sw_rcd_sn_be, sizeof(info->rec_seq)); - sq = &c->async_icosq; ktls_resync = sq->ktls_resync; + trigger_poll = false; spin_lock_bh(&ktls_resync->lock); - list_add_tail(&priv_rx->list, &ktls_resync->list); - trigger_poll = !test_and_set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); + spin_lock_bh(&priv_rx->lock); + memcpy(info->rec_seq, &priv_rx->resync.sw_rcd_sn_be, sizeof(info->rec_seq)); + if (list_empty(&priv_rx->list)) { + list_add_tail(&priv_rx->list, &ktls_resync->list); + trigger_poll = !test_and_set_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); + } + spin_unlock_bh(&priv_rx->lock); spin_unlock_bh(&ktls_resync->lock); if (!trigger_poll) @@ -617,6 +622,8 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, if (err) goto err_create_key; + INIT_LIST_HEAD(&priv_rx->list); + spin_lock_init(&priv_rx->lock); priv_rx->crypto_info = *(struct tls12_crypto_info_aes_gcm_128 *)crypto_info; @@ -730,10 +737,14 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) priv_rx = list_first_entry(&local_list, struct mlx5e_ktls_offload_context_rx, list); + spin_lock(&priv_rx->lock); cseg = post_static_params(sq, priv_rx); - if (IS_ERR(cseg)) + if (IS_ERR(cseg)) { + spin_unlock(&priv_rx->lock); break; - list_del(&priv_rx->list); + } + list_del_init(&priv_rx->list); + spin_unlock(&priv_rx->lock); db_cseg = cseg; } if (db_cseg) From 362980eada85b5ea691e5e0d9257a991aa7ade47 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 21 Oct 2021 18:15:10 +0300 Subject: [PATCH 172/336] net/mlx5e: Wait for concurrent flow deletion during neigh/fib events Function mlx5e_take_tmp_flow() skips flows with zero reference count. This can cause syndrome 0x179e84 when the called from neigh or route update code and the skipped flow is not removed from the hardware by the time underlying encap/decap resource is deleted. Add new completion 'del_hw_done' that is completed when flow is unoffloaded. This is safe to do because flow with reference count zero needs to be detached from encap/decap entry before its memory is deallocated, which requires taking the encap_tbl_lock mutex that is held by the event handlers code. 
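The handshake can be sketched with a toy completion built from a mutex and a condition variable, standing in for the kernel's struct completion: the deleting side signals exactly once when the hardware teardown is finished, and a walker that finds the flow already on its way out waits for that signal instead of racing with the teardown. All names are invented for the example; this is not the mlx5e code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        bool done;
};

static void complete_all(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_broadcast(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

struct flow {
        int refcnt;                     /* 0: deletion already in progress */
        struct completion del_hw_done;
};

static void *delete_flow(void *data)
{
        struct flow *flow = data;

        printf("unoffloading flow from hardware\n");
        complete_all(&flow->del_hw_done);       /* teardown finished */
        return NULL;
}

int main(void)
{
        struct flow flow = {
                .refcnt = 0,
                .del_hw_done = { PTHREAD_MUTEX_INITIALIZER,
                                 PTHREAD_COND_INITIALIZER, false },
        };
        pthread_t deleter;

        pthread_create(&deleter, NULL, delete_flow, &flow);

        /* Neigh/FIB handler: the flow is being deleted, wait it out. */
        if (flow.refcnt == 0)
                wait_for_completion(&flow.del_hw_done);
        printf("safe to release the encap/decap entry\n");

        pthread_join(deleter, NULL);
        return 0;
}

In the driver, del_hw_done is initialized when the flow is allocated and completed in mlx5e_tc_del_fdb_flow() once the rules have been unoffloaded, which is what the diff below adds.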
Fixes: 8914add2c9e5 ("net/mlx5e: Handle FIB events to update tunnel endpoint device") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 8 +++++++- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h index 8f64f2c8895a..b689701ac7d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h @@ -102,6 +102,7 @@ struct mlx5e_tc_flow { refcount_t refcnt; struct rcu_head rcu_head; struct completion init_done; + struct completion del_hw_done; int tunnel_id; /* the mapped tunnel id of this flow */ struct mlx5_flow_attr *attr; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c index 660cca73c36c..042b1abe1437 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c @@ -245,8 +245,14 @@ static void mlx5e_take_tmp_flow(struct mlx5e_tc_flow *flow, struct list_head *flow_list, int index) { - if (IS_ERR(mlx5e_flow_get(flow))) + if (IS_ERR(mlx5e_flow_get(flow))) { + /* Flow is being deleted concurrently. Wait for it to be + * unoffloaded from hardware, otherwise deleting encap will + * fail. + */ + wait_for_completion(&flow->del_hw_done); return; + } wait_for_completion(&flow->init_done); flow->tmp_entry_index = index; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 835caa1c7b74..cb76c41fe163 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1600,6 +1600,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv, else mlx5e_tc_unoffload_fdb_rules(esw, flow, attr); } + complete_all(&flow->del_hw_done); if (mlx5_flow_has_geneve_opt(flow)) mlx5_geneve_tlv_option_del(priv->mdev->geneve); @@ -4465,6 +4466,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size, INIT_LIST_HEAD(&flow->l3_to_l2_reformat); refcount_set(&flow->refcnt, 1); init_completion(&flow->init_done); + init_completion(&flow->del_hw_done); *__flow = flow; *__parse_attr = parse_attr; From d7751d6476185ff754b9dad2cba0c0a6e43ecadc Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 20 May 2021 17:09:58 +0300 Subject: [PATCH 173/336] net/mlx5: E-Switch, Fix resetting of encap mode when entering switchdev E-Switch encap mode is relevant only when in switchdev mode. The RDMA driver can query the encap configuration via mlx5_eswitch_get_encap_mode(). Make sure it returns the currently used mode and not the set one. This reverts the cited commit which reset the encap mode on entering switchdev and fixes the original issue properly. 
Fixes: 9a64144d683a ("net/mlx5: E-Switch, Fix default encap mode") Signed-off-by: Paul Blakey Reviewed-by: Mark Bloch Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 +++++++-- .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 ------- include/linux/mlx5/eswitch.h | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index ec136b499204..5872cc8bf953 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1572,6 +1572,11 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->enabled_vports = 0; esw->mode = MLX5_ESWITCH_NONE; esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE; + if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) && + MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)) + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; + else + esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; dev->priv.eswitch = esw; BLOCKING_INIT_NOTIFIER_HEAD(&esw->n_head); @@ -1934,7 +1939,7 @@ free_out: return err; } -u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) +u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev) { struct mlx5_eswitch *esw = dev->priv.eswitch; @@ -1948,7 +1953,7 @@ mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev) struct mlx5_eswitch *esw; esw = dev->priv.eswitch; - return mlx5_esw_allowed(esw) ? esw->offloads.encap : + return (mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS) ? esw->offloads.encap : DEVLINK_ESWITCH_ENCAP_MODE_NONE; } EXPORT_SYMBOL(mlx5_eswitch_get_encap_mode); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index f4eaa5893886..80fa76f60e1e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3183,12 +3183,6 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) u64 mapping_id; int err; - if (MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, reformat) && - MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, decap)) - esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_BASIC; - else - esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; - mutex_init(&esw->offloads.termtbl_mutex); mlx5_rdma_enable_roce(esw->dev); @@ -3286,7 +3280,6 @@ void esw_offloads_disable(struct mlx5_eswitch *esw) esw_offloads_metadata_uninit(esw); mlx5_rdma_disable_roce(esw->dev); mutex_destroy(&esw->offloads.termtbl_mutex); - esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE; } static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode) diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 97afcea39a7b..8b18fe9771f9 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -145,13 +145,13 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, GENMASK(31 - ESW_TUN_ID_BITS - ESW_RESERVED_BITS, \ ESW_TUN_OPTS_OFFSET + 1) -u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); struct mlx5_core_dev *mlx5_eswitch_get_core_dev(struct mlx5_eswitch *esw); #else /* CONFIG_MLX5_ESWITCH */ -static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) +static inline u8 mlx5_eswitch_mode(const struct mlx5_core_dev *dev) { return MLX5_ESWITCH_NONE; } From 76ded29d3fcda4928da8849ffc446ea46871c1c2 Mon Sep 17 00:00:00 2001 
From: Valentine Fatiev Date: Tue, 26 Oct 2021 11:42:41 +0300 Subject: [PATCH 174/336] net/mlx5e: nullify cq->dbg pointer in mlx5_debug_cq_remove() Prior to this patch, if mlx5_core_destroy_cq() failed, it still proceeded to the rest of the destroy operations. mlx5_core_destroy_cq() could then be called again by the user, causing an additional call of mlx5_debug_cq_remove(). cq->dbg was not nullified in the previous call, which caused the crash. Fix it by nullifying the cq->dbg pointer after removal. Also proceed to the destroy operations only if FW returns 0 for the MLX5_CMD_OP_DESTROY_CQ command. general protection fault, probably for non-canonical address 0x2000300004058: 0000 [#1] SMP PTI CPU: 5 PID: 1228 Comm: python Not tainted 5.15.0-rc5_for_upstream_min_debug_2021_10_14_11_06 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:lockref_get+0x1/0x60 Code: 5d e9 53 ff ff ff 48 8d 7f 70 e8 0a 2e 48 00 c7 85 d0 00 00 00 02 00 00 00 c6 45 70 00 fb 5d c3 c3 cc cc cc cc cc cc cc cc 53 <48> 8b 17 48 89 fb 85 d2 75 3d 48 89 d0 bf 64 00 00 00 48 89 c1 48 RSP: 0018:ffff888137dd7a38 EFLAGS: 00010206 RAX: 0000000000000000 RBX: ffff888107d5f458 RCX: 00000000fffffffe RDX: 000000000002c2b0 RSI: ffffffff8155e2e0 RDI: 0002000300004058 RBP: ffff888137dd7a88 R08: 0002000300004058 R09: ffff8881144a9f88 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8881141d4000 R13: ffff888137dd7c68 R14: ffff888137dd7d58 R15: ffff888137dd7cc0 FS: 00007f4644f2a4c0(0000) GS:ffff8887a2d40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055b4500f4380 CR3: 0000000114f7a003 CR4: 0000000000170ea0 Call Trace: simple_recursive_removal+0x33/0x2e0 ? debugfs_remove+0x60/0x60 debugfs_remove+0x40/0x60 mlx5_debug_cq_remove+0x32/0x70 [mlx5_core] mlx5_core_destroy_cq+0x41/0x1d0 [mlx5_core] devx_obj_cleanup+0x151/0x330 [mlx5_ib] ? __pollwait+0xd0/0xd0 ? xas_load+0x5/0x70 ? xa_load+0x62/0xa0 destroy_hw_idr_uobject+0x20/0x80 [ib_uverbs] uverbs_destroy_uobject+0x3b/0x360 [ib_uverbs] uobj_destroy+0x54/0xa0 [ib_uverbs] ib_uverbs_cmd_verbs+0xaf2/0x1160 [ib_uverbs] ?
uverbs_finalize_object+0xd0/0xd0 [ib_uverbs] ib_uverbs_ioctl+0xc4/0x1b0 [ib_uverbs] __x64_sys_ioctl+0x3e4/0x8e0 Fixes: 94b960b9deff ("net/mlx5e: Fix memory leak in mlx5_core_destroy_cq() error path") Signed-off-by: Valentine Fatiev Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 5 +++-- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 02e77ffe5c3e..5371ad0a12eb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -164,13 +164,14 @@ int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) MLX5_SET(destroy_cq_in, in, cqn, cq->cqn); MLX5_SET(destroy_cq_in, in, uid, cq->uid); err = mlx5_cmd_exec_in(dev, destroy_cq, in); + if (err) + return err; synchronize_irq(cq->irqn); - mlx5_cq_put(cq); wait_for_completion(&cq->free); - return err; + return 0; } EXPORT_SYMBOL(mlx5_core_destroy_cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 07c8d9811bc8..10d195042ab5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -507,6 +507,8 @@ void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) if (!mlx5_debugfs_root) return; - if (cq->dbg) + if (cq->dbg) { rem_res_tree(cq->dbg); + cq->dbg = NULL; + } } From 9091b821aaa4c2d107ca8f97c32baefcb1e7e40d Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 3 Nov 2021 01:09:04 +0200 Subject: [PATCH 175/336] net/mlx5: DR, Handle eswitch manager and uplink vports separately When querying eswitch manager vport capabilities as "other = 1", we encounter a FW compatibility issue with older FW versions. To maintain backward compatibility, eswitch manager vport should be queried as "other = 0" vport both for ECPF and non-ECPF cases. This patch fixes these queries and improves the code readability by handling eswitch manager and uplink vports separately, avoiding the excessive 'if' conditions. Also, uplink caps are stored similar to esw manager and not as part of xarray. 
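The resulting lookup layout can be pictured with this small sketch (invented names, not the mlx5dr code): the two special vports get dedicated cached slots, while all other vports keep going through the xarray.

#include <linux/types.h>
#include <linux/xarray.h>

struct vport_cap { u64 icm_address_rx, icm_address_tx; u16 gvmi; };

struct vport_caps_table {
	struct vport_cap esw_manager;	/* queried with other_vport = 0 */
	struct vport_cap uplink;	/* filled locally, no FW query  */
	struct xarray others;		/* remaining vports, on demand  */
};

static struct vport_cap *vport_caps_lookup(struct vport_caps_table *t,
					   u16 vport, bool is_esw_manager,
					   bool is_uplink)
{
	if (is_esw_manager)
		return &t->esw_manager;
	if (is_uplink)
		return &t->uplink;
	/* may return NULL, in which case the caller queries FW and stores it */
	return xa_load(&t->others, vport);
}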
Fixes: dd4acb2a0954 ("net/mlx5: DR, Add missing query for vport 0") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/steering/dr_domain.c | 56 ++++++++----------- .../mellanox/mlx5/core/steering/dr_types.h | 1 + 2 files changed, 24 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 49089cbe897c..8cbd36c82b3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -135,25 +135,14 @@ static void dr_domain_fill_uplink_caps(struct mlx5dr_domain *dmn, static int dr_domain_query_vport(struct mlx5dr_domain *dmn, u16 vport_number, + bool other_vport, struct mlx5dr_cmd_vport_cap *vport_caps) { - u16 cmd_vport = vport_number; - bool other_vport = true; int ret; - if (vport_number == MLX5_VPORT_UPLINK) { - dr_domain_fill_uplink_caps(dmn, vport_caps); - return 0; - } - - if (dmn->info.caps.is_ecpf && vport_number == MLX5_VPORT_ECPF) { - other_vport = false; - cmd_vport = 0; - } - ret = mlx5dr_cmd_query_esw_vport_context(dmn->mdev, other_vport, - cmd_vport, + vport_number, &vport_caps->icm_address_rx, &vport_caps->icm_address_tx); if (ret) @@ -161,7 +150,7 @@ static int dr_domain_query_vport(struct mlx5dr_domain *dmn, ret = mlx5dr_cmd_query_gvmi(dmn->mdev, other_vport, - cmd_vport, + vport_number, &vport_caps->vport_gvmi); if (ret) return ret; @@ -176,9 +165,15 @@ static int dr_domain_query_esw_mngr(struct mlx5dr_domain *dmn) { return dr_domain_query_vport(dmn, dmn->info.caps.is_ecpf ? MLX5_VPORT_ECPF : 0, + false, &dmn->info.caps.vports.esw_manager_caps); } +static void dr_domain_query_uplink(struct mlx5dr_domain *dmn) +{ + dr_domain_fill_uplink_caps(dmn, &dmn->info.caps.vports.uplink_caps); +} + static struct mlx5dr_cmd_vport_cap * dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport) { @@ -190,7 +185,7 @@ dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport) if (!vport_caps) return NULL; - ret = dr_domain_query_vport(dmn, vport, vport_caps); + ret = dr_domain_query_vport(dmn, vport, true, vport_caps); if (ret) { kvfree(vport_caps); return NULL; @@ -207,16 +202,26 @@ dr_domain_add_vport_cap(struct mlx5dr_domain *dmn, u16 vport) return vport_caps; } +static bool dr_domain_is_esw_mgr_vport(struct mlx5dr_domain *dmn, u16 vport) +{ + struct mlx5dr_cmd_caps *caps = &dmn->info.caps; + + return (caps->is_ecpf && vport == MLX5_VPORT_ECPF) || + (!caps->is_ecpf && vport == 0); +} + struct mlx5dr_cmd_vport_cap * mlx5dr_domain_get_vport_cap(struct mlx5dr_domain *dmn, u16 vport) { struct mlx5dr_cmd_caps *caps = &dmn->info.caps; struct mlx5dr_cmd_vport_cap *vport_caps; - if ((caps->is_ecpf && vport == MLX5_VPORT_ECPF) || - (!caps->is_ecpf && vport == 0)) + if (dr_domain_is_esw_mgr_vport(dmn, vport)) return &caps->vports.esw_manager_caps; + if (vport == MLX5_VPORT_UPLINK) + return &caps->vports.uplink_caps; + vport_load: vport_caps = xa_load(&caps->vports.vports_caps_xa, vport); if (vport_caps) @@ -241,17 +246,6 @@ static void dr_domain_clear_vports(struct mlx5dr_domain *dmn) } } -static int dr_domain_query_uplink(struct mlx5dr_domain *dmn) -{ - struct mlx5dr_cmd_vport_cap *vport_caps; - - vport_caps = mlx5dr_domain_get_vport_cap(dmn, MLX5_VPORT_UPLINK); - if (!vport_caps) - return -EINVAL; - - return 0; -} - static int dr_domain_query_fdb_caps(struct mlx5_core_dev *mdev, struct mlx5dr_domain *dmn) { @@ -281,11 +275,7 @@ static int 
dr_domain_query_fdb_caps(struct mlx5_core_dev *mdev, goto free_vports_caps_xa; } - ret = dr_domain_query_uplink(dmn); - if (ret) { - mlx5dr_err(dmn, "Failed to query uplink vport caps (err: %d)", ret); - goto free_vports_caps_xa; - } + dr_domain_query_uplink(dmn); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 3028b776da00..2333c2439c28 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -764,6 +764,7 @@ struct mlx5dr_roce_cap { struct mlx5dr_vports { struct mlx5dr_cmd_vport_cap esw_manager_caps; + struct mlx5dr_cmd_vport_cap uplink_caps; struct xarray vports_caps_xa; }; From 455832d49666e1765acf812be79710b9f84a8cbf Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 3 Nov 2021 17:51:03 +0200 Subject: [PATCH 176/336] net/mlx5: DR, Fix check for unsupported fields in match param The existing loop doesn't cast the buffer while scanning it, which results in out-of-bounds read and failure to create the matcher. Fixes: 941f19798a11 ("net/mlx5: DR, Add check for unsupported fields in match param") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/steering/dr_matcher.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 75c775bee351..793365242e85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -924,11 +924,12 @@ static int dr_matcher_init(struct mlx5dr_matcher *matcher, /* Check that all mask data was consumed */ for (i = 0; i < consumed_mask.match_sz; i++) { - if (consumed_mask.match_buf[i]) { - mlx5dr_dbg(dmn, "Match param mask contains unsupported parameters\n"); - ret = -EOPNOTSUPP; - goto free_consumed_mask; - } + if (!((u8 *)consumed_mask.match_buf)[i]) + continue; + + mlx5dr_dbg(dmn, "Match param mask contains unsupported parameters\n"); + ret = -EOPNOTSUPP; + goto free_consumed_mask; } ret = 0; From ba50cd9451f6c49cf0841c0a4a146ff6a2822699 Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Wed, 27 Oct 2021 15:16:14 +0300 Subject: [PATCH 177/336] net/mlx5: Update error handler for UCTX and UMEM In the fast unload flow, the device state is set to internal error, which indicates that the driver started the destroy process. In this case, when a destroy command is being executed, it should return MLX5_CMD_STAT_OK. Fix MLX5_CMD_OP_DESTROY_UCTX and MLX5_CMD_OP_DESTROY_UMEM to return OK instead of EIO. 
This fixes a call trace in the umem release process - [ 2633.536695] Call Trace: [ 2633.537518] ib_uverbs_remove_one+0xc3/0x140 [ib_uverbs] [ 2633.538596] remove_client_context+0x8b/0xd0 [ib_core] [ 2633.539641] disable_device+0x8c/0x130 [ib_core] [ 2633.540615] __ib_unregister_device+0x35/0xa0 [ib_core] [ 2633.541640] ib_unregister_device+0x21/0x30 [ib_core] [ 2633.542663] __mlx5_ib_remove+0x38/0x90 [mlx5_ib] [ 2633.543640] auxiliary_bus_remove+0x1e/0x30 [auxiliary] [ 2633.544661] device_release_driver_internal+0x103/0x1f0 [ 2633.545679] bus_remove_device+0xf7/0x170 [ 2633.546640] device_del+0x181/0x410 [ 2633.547606] mlx5_rescan_drivers_locked.part.10+0x63/0x160 [mlx5_core] [ 2633.548777] mlx5_unregister_device+0x27/0x40 [mlx5_core] [ 2633.549841] mlx5_uninit_one+0x21/0xc0 [mlx5_core] [ 2633.550864] remove_one+0x69/0xe0 [mlx5_core] [ 2633.551819] pci_device_remove+0x3b/0xc0 [ 2633.552731] device_release_driver_internal+0x103/0x1f0 [ 2633.553746] unbind_store+0xf6/0x130 [ 2633.554657] kernfs_fop_write+0x116/0x190 [ 2633.555567] vfs_write+0xa5/0x1a0 [ 2633.556407] ksys_write+0x4f/0xb0 [ 2633.557233] do_syscall_64+0x5b/0x1a0 [ 2633.558071] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 2633.559018] RIP: 0033:0x7f9977132648 [ 2633.559821] Code: 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 55 6f 2d 00 8b 00 85 c0 75 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 49 89 d4 55 [ 2633.562332] RSP: 002b:00007fffb1a83888 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 2633.563472] RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f9977132648 [ 2633.564541] RDX: 000000000000000c RSI: 000055b90546e230 RDI: 0000000000000001 [ 2633.565596] RBP: 000055b90546e230 R08: 00007f9977406860 R09: 00007f9977a54740 [ 2633.566653] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f99774056e0 [ 2633.567692] R13: 000000000000000c R14: 00007f9977400880 R15: 000000000000000c [ 2633.568725] ---[ end trace 10b4fe52945e544d ]--- Fixes: 6a6fabbfa3e8 ("net/mlx5: Update pci error handler entries and command translation") Signed-off-by: Neta Ostrovsky Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index f71ec4d9d68e..8eaa24d865c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -339,6 +339,8 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_PAGE_FAULT_RESUME: case MLX5_CMD_OP_QUERY_ESW_FUNCTIONS: case MLX5_CMD_OP_DEALLOC_SF: + case MLX5_CMD_OP_DESTROY_UCTX: + case MLX5_CMD_OP_DESTROY_UMEM: return MLX5_CMD_STAT_OK; case MLX5_CMD_OP_QUERY_HCA_CAP: @@ -464,9 +466,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: case MLX5_CMD_OP_CREATE_UCTX: - case MLX5_CMD_OP_DESTROY_UCTX: case MLX5_CMD_OP_CREATE_UMEM: - case MLX5_CMD_OP_DESTROY_UMEM: case MLX5_CMD_OP_ALLOC_MEMIC: case MLX5_CMD_OP_MODIFY_XRQ: case MLX5_CMD_OP_RELEASE_XRQ_ERROR: From 2eb0cb31bc4ce2ede5460cf3ef433b40cf5f040d Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Nov 2021 15:19:12 +0000 Subject: [PATCH 178/336] net/mlx5: E-Switch, rebuild lag only when needed A user can enable VFs without changing E-Switch mode, this can happen when a user moves straight to 
switchdev mode and only once in switchdev VFs are enabled via the sysfs interface. The cited commit assumed this isn't possible and exposed a single API function where the E-switch calls into the lag code, breaks the lag and prevents any other lag operations to take place until the E-switch update has ended. Breaking the hardware lag when it isn't needed can make it such that hardware lag can't be enabled again. In the sysfs call path check if the current E-Switch mode is NONE, in the context of the function it can only mean the E-Switch is moving out of NONE mode and the hardware lag should be disabled and enabled once the mode change has ended. If the mode isn't NONE it means VFs are about to be enabled and such operation doesn't require toggling the hardware lag. Fixes: cac1eb2cf2e3 ("net/mlx5: Lag, properly lock eswitch if needed") Signed-off-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 5872cc8bf953..51a8cecc4a7c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1305,12 +1305,17 @@ abort: */ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { + bool toggle_lag; int ret; if (!mlx5_esw_allowed(esw)) return 0; - mlx5_lag_disable_change(esw->dev); + toggle_lag = esw->mode == MLX5_ESWITCH_NONE; + + if (toggle_lag) + mlx5_lag_disable_change(esw->dev); + down_write(&esw->mode_lock); if (esw->mode == MLX5_ESWITCH_NONE) { ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs); @@ -1324,7 +1329,10 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) esw->esw_funcs.num_vfs = num_vfs; } up_write(&esw->mode_lock); - mlx5_lag_enable_change(esw->dev); + + if (toggle_lag) + mlx5_lag_enable_change(esw->dev); + return ret; } From 38a54cae6f76c3e6a1e6c1e52c2e43a069fa78cb Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Wed, 3 Nov 2021 13:04:23 +0200 Subject: [PATCH 179/336] net/mlx5: Fix flow counters SF bulk query len When doing a flow counters bulk query, the number of counters to query must be aligned to 4. Current SF bulk query len is not aligned to 4, which leads to an error when trying to query more than 4 counters. Fix it by aligning SF bulk query len to 4. 
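The alignment requirement itself is plain integer arithmetic; the stand-alone example below (not driver code) rounds a requested bulk length up to the next multiple of 4 the same way the kernel's ALIGN() macro would, so the old value of 6 becomes 8.

#include <stdio.h>

/* round x up to a multiple of a; a must be a power of two */
#define BULK_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int requested = 6;	/* the old, unaligned bulk length */
	unsigned int aligned = BULK_ALIGN(requested, 4U);

	printf("requested=%u aligned=%u\n", requested, aligned);	/* 6 -> 8 */
	return 0;
}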
Fixes: 2fdeb4f4c2ae ("net/mlx5: Reduce flow counters bulk query buffer size for SFs") Signed-off-by: Avihai Horon Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 31c99d53faf7..7e0e04cf26f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -40,7 +40,7 @@ #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) /* Max number of counters to query in bulk read is 32K */ #define MLX5_SW_MAX_COUNTERS_BULK BIT(15) -#define MLX5_SF_NUM_COUNTERS_BULK 6 +#define MLX5_SF_NUM_COUNTERS_BULK 8 #define MLX5_FC_POOL_MAX_THRESHOLD BIT(18) #define MLX5_FC_POOL_USED_BUFF_RATIO 10 From 806401c20a0f9c51b6c8fd7035671e6ca841f6c2 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 8 Nov 2021 16:41:05 +0200 Subject: [PATCH 180/336] net/mlx5e: CT, Fix multiple allocations and memleak of mod acts CT clear action offload adds additional mod hdr actions to the flow's original mod actions in order to clear the registers which hold ct_state. When such flow also includes encap action, a neigh update event can cause the driver to unoffload the flow and then reoffload it. Each time this happens, the ct clear handling adds that same set of mod hdr actions to reset ct_state until the max of mod hdr actions is reached. Also the driver never releases the allocated mod hdr actions and causing a memleak. Fix above two issues by moving CT clear mod acts allocation into the parsing actions phase and only use it when offloading the rule. The release of mod acts will be done in the normal flow_put(). 
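The lifetime rule the fix establishes can be summarized by the sketch below. The helpers (build_ct_clear_acts() and friends) are hypothetical names invented for illustration, not the mlx5e API: the actions are built once at parse time, reused read-only by every offload and re-offload, and freed at the single release point.

#include <errno.h>
#include <stddef.h>

struct mod_hdr_acts;					/* opaque here */

struct mod_hdr_acts *build_ct_clear_acts(void);		/* hypothetical helpers */
int program_hw(const struct mod_hdr_acts *acts);
void free_ct_clear_acts(struct mod_hdr_acts *acts);

struct flow_obj {
	struct mod_hdr_acts *ct_clear_acts;	/* built exactly once */
};

static int flow_parse(struct flow_obj *flow)
{
	flow->ct_clear_acts = build_ct_clear_acts();
	return flow->ct_clear_acts ? 0 : -ENOMEM;
}

/* called on every offload and re-offload (e.g. after a neigh update) */
static int flow_offload_hw(struct flow_obj *flow)
{
	return program_hw(flow->ct_clear_acts);		/* read-only reuse */
}

/* the one place the flow is released */
static void flow_put(struct flow_obj *flow)
{
	free_ct_clear_acts(flow->ct_clear_acts);
	flow->ct_clear_acts = NULL;
}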
backtrace: [<000000007316e2f3>] krealloc+0x83/0xd0 [<00000000ef157de1>] mlx5e_mod_hdr_alloc+0x147/0x300 [mlx5_core] [<00000000970ce4ae>] mlx5e_tc_match_to_reg_set_and_get_id+0xd7/0x240 [mlx5_core] [<0000000067c5fa17>] mlx5e_tc_match_to_reg_set+0xa/0x20 [mlx5_core] [<00000000d032eb98>] mlx5_tc_ct_entry_set_registers.isra.0+0x36/0xc0 [mlx5_core] [<00000000fd23b869>] mlx5_tc_ct_flow_offload+0x272/0x1f10 [mlx5_core] [<000000004fc24acc>] mlx5e_tc_offload_fdb_rules.part.0+0x150/0x620 [mlx5_core] [<00000000dc741c17>] mlx5e_tc_encap_flows_add+0x489/0x690 [mlx5_core] [<00000000e92e49d7>] mlx5e_rep_update_flows+0x6e4/0x9b0 [mlx5_core] [<00000000f60f5602>] mlx5e_rep_neigh_update+0x39a/0x5d0 [mlx5_core] Fixes: 1ef3018f5af3 ("net/mlx5e: CT: Support clear action") Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 26 ++++++++++++------- .../ethernet/mellanox/mlx5/core/en/tc_ct.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/en_tc.c | 8 ++++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index c1c6e74c79c4..2445e2ae3324 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -1356,9 +1356,13 @@ mlx5_tc_ct_match_add(struct mlx5_tc_ct_priv *priv, int mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, const struct flow_action_entry *act, struct netlink_ext_ack *extack) { + bool clear_action = act->ct.action & TCA_CT_ACT_CLEAR; + int err; + if (!priv) { NL_SET_ERR_MSG_MOD(extack, "offload of ct action isn't available"); @@ -1369,6 +1373,17 @@ mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, attr->ct_attr.ct_action = act->ct.action; attr->ct_attr.nf_ft = act->ct.flow_table; + if (!clear_action) + goto out; + + err = mlx5_tc_ct_entry_set_registers(priv, mod_acts, 0, 0, 0, 0); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to set registers for ct clear"); + return err; + } + attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + +out: return 0; } @@ -1898,23 +1913,16 @@ __mlx5_tc_ct_flow_offload_clear(struct mlx5_tc_ct_priv *ct_priv, memcpy(pre_ct_attr, attr, attr_sz); - err = mlx5_tc_ct_entry_set_registers(ct_priv, mod_acts, 0, 0, 0, 0); - if (err) { - ct_dbg("Failed to set register for ct clear"); - goto err_set_registers; - } - mod_hdr = mlx5_modify_header_alloc(priv->mdev, ct_priv->ns_type, mod_acts->num_actions, mod_acts->actions); if (IS_ERR(mod_hdr)) { err = PTR_ERR(mod_hdr); ct_dbg("Failed to add create ct clear mod hdr"); - goto err_set_registers; + goto err_mod_hdr; } pre_ct_attr->modify_hdr = mod_hdr; - pre_ct_attr->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; rule = mlx5_tc_rule_insert(priv, orig_spec, pre_ct_attr); if (IS_ERR(rule)) { @@ -1930,7 +1938,7 @@ __mlx5_tc_ct_flow_offload_clear(struct mlx5_tc_ct_priv *ct_priv, err_insert: mlx5_modify_header_dealloc(priv->mdev, mod_hdr); -err_set_registers: +err_mod_hdr: netdev_warn(priv->netdev, "Failed to offload ct clear flow, err %d\n", err); kfree(pre_ct_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h index 363329f4aac6..99662af1e41a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.h @@ -110,6 +110,7 @@ int mlx5_tc_ct_add_no_trk_match(struct 
mlx5_flow_spec *spec); int mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, const struct flow_action_entry *act, struct netlink_ext_ack *extack); @@ -172,6 +173,7 @@ mlx5_tc_ct_add_no_trk_match(struct mlx5_flow_spec *spec) static inline int mlx5_tc_ct_parse_action(struct mlx5_tc_ct_priv *priv, struct mlx5_flow_attr *attr, + struct mlx5e_tc_mod_hdr_acts *mod_acts, const struct flow_action_entry *act, struct netlink_ext_ack *extack) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index cb76c41fe163..3d45f4ae80c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -3608,7 +3608,9 @@ parse_tc_nic_actions(struct mlx5e_priv *priv, attr->dest_chain = act->chain_index; break; case FLOW_ACTION_CT: - err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, act, extack); + err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, + &parse_attr->mod_hdr_acts, + act, extack); if (err) return err; @@ -4277,7 +4279,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, NL_SET_ERR_MSG_MOD(extack, "Sample action with connection tracking is not supported"); return -EOPNOTSUPP; } - err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, act, extack); + err = mlx5_tc_ct_parse_action(get_ct_priv(priv), attr, + &parse_attr->mod_hdr_acts, + act, extack); if (err) return err; From ae396d85c01c7bdc9eeceecde1f493d03f793465 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Fri, 5 Nov 2021 11:19:48 +0200 Subject: [PATCH 181/336] net/mlx5: Lag, update tracker when state change event received Currently, In NETDEV_CHANGELOWERSTATE/NETDEV_CHANGEUPPERSTATE events handling, tracking is not fully completed if the LAG device is not ready at the time the events occur. But, we must keep track of the upper and lower states after receiving the events because RoCE needs this info in mlx5_lag_get_roce_netdev() - in order to return the corresponding port that its running on. Returning the wrong (not most recent) port will lead to gids table being incorrect. For example: If during the attachment of a slave to the bond, the other non-attached port performs pci_reload, then the LAG device is not ready, but that should not result in dismissing attached slave tracker update automatically (which is performed in mlx5_handle_changelowerstate()), Since these events might not come later, which can lead to both bond ports having tx_enabled=0 - which is not a valid state of LAG bond. 
Fixes: 9b412cc35f00 ("net/mlx5e: Add LAG warning if bond slave is not lag master") Signed-off-by: Maher Sanalla Reviewed-by: Mark Bloch Reviewed-by: Jianbo Liu Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 48d2ea690d7a..4ddf6b330a44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -615,6 +615,7 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, bool is_bonded, is_in_lag, mode_supported; int bond_status = 0; int num_slaves = 0; + int changed = 0; int idx; if (!netif_is_lag_master(upper)) @@ -653,27 +654,27 @@ static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev, */ is_in_lag = num_slaves == MLX5_MAX_PORTS && bond_status == 0x3; - if (!mlx5_lag_is_ready(ldev) && is_in_lag) { - NL_SET_ERR_MSG_MOD(info->info.extack, - "Can't activate LAG offload, PF is configured with more than 64 VFs"); - return 0; - } - /* Lag mode must be activebackup or hash. */ mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP || tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH; - if (is_in_lag && !mode_supported) - NL_SET_ERR_MSG_MOD(info->info.extack, - "Can't activate LAG offload, TX type isn't supported"); - is_bonded = is_in_lag && mode_supported; if (tracker->is_bonded != is_bonded) { tracker->is_bonded = is_bonded; - return 1; + changed = 1; } - return 0; + if (!is_in_lag) + return changed; + + if (!mlx5_lag_is_ready(ldev)) + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, PF is configured with more than 64 VFs"); + else if (!mode_supported) + NL_SET_ERR_MSG_MOD(info->info.extack, + "Can't activate LAG offload, TX type isn't supported"); + + return changed; } static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev, @@ -716,9 +717,6 @@ static int mlx5_lag_netdev_event(struct notifier_block *this, ldev = container_of(this, struct mlx5_lag, nb); - if (!mlx5_lag_is_ready(ldev) && event == NETDEV_CHANGELOWERSTATE) - return NOTIFY_DONE; - tracker = ldev->tracker; switch (event) { From c4c3176739dfa6efcc5b1d1de4b3fd2b51b048c7 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Mon, 1 Nov 2021 16:18:53 +0200 Subject: [PATCH 182/336] net/mlx5: E-Switch, return error if encap isn't supported On regular ConnectX HCAs getting encap mode isn't supported when the E-Switch is in NONE mode. Current code would return no error code when trying to get encap mode in such case which is wrong. Fix by returning error value to indicate failure to caller in such case. 
Fixes: 8e0aa4bc959c ("net/mlx5: E-switch, Protect eswitch mode changes") Signed-off-by: Raed Salem Reviewed-by: Mark Bloch Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 80fa76f60e1e..a46455694f7a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3623,7 +3623,7 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, *encap = esw->offloads.encap; unlock: up_write(&esw->mode_lock); - return 0; + return err; } static bool From 23ef63d5e14f916c5bba39128ebef395859d7c0f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 15 Nov 2021 12:37:46 +0900 Subject: [PATCH 183/336] ata: libata: improve ata_read_log_page() error message If ata_read_log_page() fails to read a log page, the ata_dev_err() error message only print the page number, omitting the log number. In case of error, facilitate debugging by also printing the log number. Cc: stable@kernel.org # 5.15 Signed-off-by: Damien Le Moal Tested-by: Matthew Perkowski --- drivers/ata/libata-core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 8a0ccb190d76..edaedcd82630 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2031,8 +2031,9 @@ retry: dev->horkage |= ATA_HORKAGE_NO_DMA_LOG; goto retry; } - ata_dev_err(dev, "Read log page 0x%02x failed, Emask 0x%x\n", - (unsigned int)page, err_mask); + ata_dev_err(dev, + "Read log 0x%02x page 0x%02x failed, Emask 0x%x\n", + (unsigned int)log, (unsigned int)page, err_mask); } return err_mask; From d1faacbf67b1944f0e0c618dc581d929263f6fe9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Nov 2021 10:15:59 -0800 Subject: [PATCH 184/336] Revert "mark pstore-blk as broken" This reverts commit d07f3b081ee632268786601f55e1334d1f68b997. pstore-blk was fixed to avoid the unwanted APIs in commit 7bb9557b48fc ("pstore/blk: Use the normal block device I/O path"), which landed in the same release as the commit adding BROKEN. 
Cc: Jens Axboe Cc: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20211116181559.3975566-1-keescook@chromium.org Signed-off-by: Jens Axboe --- fs/pstore/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 328da35da390..8adabde685f1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -173,7 +173,6 @@ config PSTORE_BLK tristate "Log panic/oops to a block device" depends on PSTORE depends on BLOCK - depends on BROKEN select PSTORE_ZONE default n help From 3ff1f6b6ba6f97f50862aa50e79959cc8ddc2566 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 4 Nov 2021 11:10:53 -0700 Subject: [PATCH 185/336] scsi: ufs: core: Improve SCSI abort handling The following has been observed on a test setup: WARNING: CPU: 4 PID: 250 at drivers/scsi/ufs/ufshcd.c:2737 ufshcd_queuecommand+0x468/0x65c Call trace: ufshcd_queuecommand+0x468/0x65c scsi_send_eh_cmnd+0x224/0x6a0 scsi_eh_test_devices+0x248/0x418 scsi_eh_ready_devs+0xc34/0xe58 scsi_error_handler+0x204/0x80c kthread+0x150/0x1b4 ret_from_fork+0x10/0x30 That warning is triggered by the following statement: WARN_ON(lrbp->cmd); Fix this warning by clearing lrbp->cmd from the abort handler. Link: https://lore.kernel.org/r/20211104181059.4129537-1-bvanassche@acm.org Fixes: 7a3e97b0dc4b ("[SCSI] ufshcd: UFS Host controller driver") Reviewed-by: Bean Huo Reviewed-by: Stanley Chu Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index afd38142b1c0..c32bd78820af 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7116,6 +7116,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd) goto release; } + lrbp->cmd = NULL; err = SUCCESS; release: From a0c2f8b6709a9a4af175497ca65f93804f57b248 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 5 Nov 2021 17:10:47 -0500 Subject: [PATCH 186/336] scsi: iscsi: Unblock session then wake up error handler We can race where iscsi_session_recovery_timedout() has woken up the error handler thread and it's now setting the devices to offline, and session_recovery_timedout()'s call to scsi_target_unblock() is also trying to set the device's state to transport-offline. We can then get a mix of states. For the case where we can't relogin we want the devices to be in transport-offline so when we have repaired the connection __iscsi_unblock_session() can set the state back to running. Set the device state then call into libiscsi to wake up the error handler. Link: https://lore.kernel.org/r/20211105221048.6541-2-michael.christie@oracle.com Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_transport_iscsi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 78343d3f9385..554b6f784223 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1899,12 +1899,12 @@ static void session_recovery_timedout(struct work_struct *work) } spin_unlock_irqrestore(&session->lock, flags); - if (session->transport->session_recovery_timedout) - session->transport->session_recovery_timedout(session); - ISCSI_DBG_TRANS_SESSION(session, "Unblocking SCSI target\n"); scsi_target_unblock(&session->dev, SDEV_TRANSPORT_OFFLINE); ISCSI_DBG_TRANS_SESSION(session, "Completed unblocking SCSI target\n"); + + if (session->transport->session_recovery_timedout) + session->transport->session_recovery_timedout(session); } static void __iscsi_unblock_session(struct work_struct *work) From 4edd8cd4e86dd3047e5294bbefcc0a08f66a430f Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 5 Nov 2021 17:10:48 -0500 Subject: [PATCH 187/336] scsi: core: sysfs: Fix hang when device state is set via sysfs This fixes a regression added with: commit f0f82e2476f6 ("scsi: core: Fix capacity set to zero after offlinining device") The problem is that after iSCSI recovery, iscsid will call into the kernel to set the dev's state to running, and with that patch we now call scsi_rescan_device() with the state_mutex held. If the SCSI error handler thread is just starting to test the device in scsi_send_eh_cmnd() then it's going to try to grab the state_mutex. We are then stuck, because when scsi_rescan_device() tries to send its I/O scsi_queue_rq() calls -> scsi_host_queue_ready() -> scsi_host_in_recovery() which will return true (the host state is still in recovery) and I/O will just be requeued. scsi_send_eh_cmnd() will then never be able to grab the state_mutex to finish error handling. To prevent the deadlock move the rescan-related code to after we drop the state_mutex. This also adds a check for if we are already in the running state. This prevents extra scans and helps the iscsid case where if the transport class has already onlined the device during its recovery process then we don't need userspace to do it again plus possibly block that daemon. Link: https://lore.kernel.org/r/20211105221048.6541-3-michael.christie@oracle.com Fixes: f0f82e2476f6 ("scsi: core: Fix capacity set to zero after offlinining device") Cc: Bart Van Assche Cc: lijinlin Cc: Wu Bo Reviewed-by: Lee Duncan Reviewed-by: Wu Bo Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_sysfs.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 55addd78fde4..7afcec250f9b 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -792,6 +792,7 @@ store_state_field(struct device *dev, struct device_attribute *attr, int i, ret; struct scsi_device *sdev = to_scsi_device(dev); enum scsi_device_state state = 0; + bool rescan_dev = false; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { const int len = strlen(sdev_states[i].name); @@ -810,20 +811,27 @@ store_state_field(struct device *dev, struct device_attribute *attr, } mutex_lock(&sdev->state_mutex); - ret = scsi_device_set_state(sdev, state); - /* - * If the device state changes to SDEV_RUNNING, we need to - * run the queue to avoid I/O hang, and rescan the device - * to revalidate it. 
Running the queue first is necessary - * because another thread may be waiting inside - * blk_mq_freeze_queue_wait() and because that call may be - * waiting for pending I/O to finish. - */ - if (ret == 0 && state == SDEV_RUNNING) { + if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { + ret = count; + } else { + ret = scsi_device_set_state(sdev, state); + if (ret == 0 && state == SDEV_RUNNING) + rescan_dev = true; + } + mutex_unlock(&sdev->state_mutex); + + if (rescan_dev) { + /* + * If the device state changes to SDEV_RUNNING, we need to + * run the queue to avoid I/O hang, and rescan the device + * to revalidate it. Running the queue first is necessary + * because another thread may be waiting inside + * blk_mq_freeze_queue_wait() and because that call may be + * waiting for pending I/O to finish. + */ blk_mq_run_hw_queues(sdev->request_queue, true); scsi_rescan_device(dev); } - mutex_unlock(&sdev->state_mutex); return ret == 0 ? count : -EINVAL; } From 886fe2915cce6658b0fc19e64b82879325de61ea Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 8 Nov 2021 08:48:14 +0200 Subject: [PATCH 188/336] scsi: ufs: core: Fix task management completion timeout race __ufshcd_issue_tm_cmd() clears req->end_io_data after timing out, which races with the completion function ufshcd_tmc_handler() which expects req->end_io_data to have a value. Note __ufshcd_issue_tm_cmd() and ufshcd_tmc_handler() are already synchronized using hba->tmf_rqs and hba->outstanding_tasks under the host_lock spinlock. It is also not necessary (nor typical) to clear req->end_io_data because the block layer does it before allocating out requests e.g. via blk_get_request(). So fix by not clearing it. Link: https://lore.kernel.org/r/20211108064815.569494-2-adrian.hunter@intel.com Fixes: f5ef336fd2e4 ("scsi: ufs: core: Fix task management completion") Reviewed-by: Bart Van Assche Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index c32bd78820af..c18aa9a43f0a 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6616,11 +6616,6 @@ static int __ufshcd_issue_tm_cmd(struct ufs_hba *hba, err = wait_for_completion_io_timeout(&wait, msecs_to_jiffies(TM_CMD_TIMEOUT)); if (!err) { - /* - * Make sure that ufshcd_compl_tm() does not trigger a - * use-after-free. - */ - req->end_io_data = NULL; ufshcd_add_tm_upiu_trace(hba, task_tag, UFS_TM_ERR); dev_err(hba->dev, "%s: task management cmd 0x%.2x timed-out\n", __func__, tm_function); From 5cb37a26355d79ab290220677b1b57d28e99a895 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 8 Nov 2021 08:48:15 +0200 Subject: [PATCH 189/336] scsi: ufs: core: Fix another task management completion race hba->outstanding_tasks, which is read under host_lock spinlock, tells the interrupt handler what task management tags are in use by the driver. The doorbell register bits indicate which tags are in use by the hardware. A doorbell bit that is 0 is because the bit has yet to be set by the driver, or because the task is complete. It is only possible to disambiguate the 2 cases, if reading/writing the doorbell register is synchronized with reading/writing hba->outstanding_tasks. For that reason, reading REG_UTP_TASK_REQ_DOOR_BELL must be done under spinlock. 
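A schematic of the rule being enforced, using invented names rather than the ufshcd structures: the hardware pending bits and the driver's outstanding bits are sampled under the same lock, because only then can a cleared doorbell bit be attributed to "completed" rather than "not issued yet".

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/spinlock.h>

struct tm_ctrl {
	spinlock_t lock;		/* orders doorbell access vs. outstanding updates */
	void __iomem *doorbell;		/* bit set = request still owned by hardware */
	unsigned long outstanding;	/* bit set = request issued by the driver */
	unsigned int nr_tags;
};

static void complete_tag(struct tm_ctrl *c, unsigned int tag)
{
	/* ... hand the finished request back to its submitter ... */
}

static void complete_finished_tags(struct tm_ctrl *c)
{
	unsigned long flags, pending, finished;
	unsigned int tag;

	spin_lock_irqsave(&c->lock, flags);
	pending = readl(c->doorbell);
	finished = c->outstanding & ~pending;	/* issued earlier, now done */
	for_each_set_bit(tag, &finished, c->nr_tags)
		complete_tag(c, tag);
	c->outstanding &= ~finished;
	spin_unlock_irqrestore(&c->lock, flags);
}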
Link: https://lore.kernel.org/r/20211108064815.569494-3-adrian.hunter@intel.com Fixes: f5ef336fd2e4 ("scsi: ufs: core: Fix task management completion") Reviewed-by: Bart Van Assche Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index c18aa9a43f0a..13c09dbd99b9 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6453,9 +6453,8 @@ static irqreturn_t ufshcd_tmc_handler(struct ufs_hba *hba) irqreturn_t ret = IRQ_NONE; int tag; - pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); - spin_lock_irqsave(hba->host->host_lock, flags); + pending = ufshcd_readl(hba, REG_UTP_TASK_REQ_DOOR_BELL); issued = hba->outstanding_tasks & ~pending; for_each_set_bit(tag, &issued, hba->nutmrs) { struct request *req = hba->tmf_rqs[tag]; From 392006871bb26166bcfafa56faf49431c2cfaaa8 Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 8 Nov 2021 13:30:12 -0500 Subject: [PATCH 190/336] scsi: qla2xxx: Fix mailbox direction flags in qla2xxx_get_adapter_id() The SCM changes set the flags in mcp->out_mb instead of mcp->in_mb so the data was not actually being read into the mcp->mb[] array from the adapter. Link: https://lore.kernel.org/r/20211108183012.13895-1-emilne@redhat.com Fixes: 9f2475fe7406 ("scsi: qla2xxx: SAN congestion management implementation") Cc: stable@vger.kernel.org Reviewed-by: Himanshu Madhani Reviewed-by: Arun Easi Signed-off-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_mbx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c index 73a353153d33..10d2655ef676 100644 --- a/drivers/scsi/qla2xxx/qla_mbx.c +++ b/drivers/scsi/qla2xxx/qla_mbx.c @@ -1695,10 +1695,8 @@ qla2x00_get_adapter_id(scsi_qla_host_t *vha, uint16_t *id, uint8_t *al_pa, mcp->in_mb |= MBX_13|MBX_12|MBX_11|MBX_10; if (IS_FWI2_CAPABLE(vha->hw)) mcp->in_mb |= MBX_19|MBX_18|MBX_17|MBX_16; - if (IS_QLA27XX(vha->hw) || IS_QLA28XX(vha->hw)) { - mcp->in_mb |= MBX_15; - mcp->out_mb |= MBX_7|MBX_21|MBX_22|MBX_23; - } + if (IS_QLA27XX(vha->hw) || IS_QLA28XX(vha->hw)) + mcp->in_mb |= MBX_15|MBX_21|MBX_22|MBX_23; mcp->tov = MBX_TOV_SECONDS; mcp->flags = 0; From 2460386bef0b9b98b71728d3c173e15558b78d82 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Mon, 15 Nov 2021 16:30:24 +0100 Subject: [PATCH 191/336] net: mvmdio: fix compilation warning The kernel test robot reported a following issue: >> drivers/net/ethernet/marvell/mvmdio.c:426:36: warning: unused variable 'orion_mdio_acpi_match' [-Wunused-const-variable] static const struct acpi_device_id orion_mdio_acpi_match[] = { ^ 1 warning generated. Fix that by surrounding the variable by appropriate ifdef. 
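For reference, the pattern in isolation, as a skeleton with placeholder names rather than the mvmdio driver: when a match table is referenced only through ACPI_PTR(), which compiles to NULL without CONFIG_ACPI, the table itself ends up unused, so both the table and its MODULE_DEVICE_TABLE() entry are kept under the same #ifdef.

#include <linux/acpi.h>
#include <linux/mod_devicetable.h>
#include <linux/module.h>
#include <linux/platform_device.h>

#ifdef CONFIG_ACPI
static const struct acpi_device_id example_acpi_match[] = {
	{ "ABCD0001", 0 },
	{ },
};
MODULE_DEVICE_TABLE(acpi, example_acpi_match);
#endif

static struct platform_driver example_driver = {
	.driver = {
		.name = "example",
		/* evaluates to NULL when CONFIG_ACPI is off, so nothing
		 * else references the table in that configuration */
		.acpi_match_table = ACPI_PTR(example_acpi_match),
	},
};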
Fixes: c54da4c1acb1 ("net: mvmdio: add ACPI support") Reported-by: kernel test robot Signed-off-by: Marcin Wojtas Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20211115153024.209083-1-mw@semihalf.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvmdio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c index 62a97c46fba0..ef878973b859 100644 --- a/drivers/net/ethernet/marvell/mvmdio.c +++ b/drivers/net/ethernet/marvell/mvmdio.c @@ -429,12 +429,14 @@ static const struct of_device_id orion_mdio_match[] = { }; MODULE_DEVICE_TABLE(of, orion_mdio_match); +#ifdef CONFIG_ACPI static const struct acpi_device_id orion_mdio_acpi_match[] = { { "MRVL0100", BUS_TYPE_SMI }, { "MRVL0101", BUS_TYPE_XSMI }, { }, }; MODULE_DEVICE_TABLE(acpi, orion_mdio_acpi_match); +#endif static struct platform_driver orion_mdio_driver = { .probe = orion_mdio_probe, From 9f5363916a5099e618e6e40606e91b8ce0833754 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 Nov 2021 14:26:10 -0500 Subject: [PATCH 192/336] bnxt_en: Fix compile error regression when CONFIG_BNXT_SRIOV is not set bp->sriov_cfg is not defined when CONFIG_BNXT_SRIOV is not set. Fix it by adding a helper function bnxt_sriov_cfg() to handle the logic with or without the config option. Fixes: 46d08f55d24e ("bnxt_en: extend RTNL to VF check in devlink driver_reinit") Reported-by: kernel test robot Reviewed-by: Edwin Peer Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://lore.kernel.org/r/1637090770-22835-1-git-send-email-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 10 ++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index d0d5da9b78f8..4c9507d82fd0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2258,6 +2258,16 @@ static inline void bnxt_db_write(struct bnxt *bp, struct bnxt_db_info *db, } } +/* Must hold rtnl_lock */ +static inline bool bnxt_sriov_cfg(struct bnxt *bp) +{ +#if defined(CONFIG_BNXT_SRIOV) + return BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg); +#else + return false; +#endif +} + extern const u16 bnxt_lhint_arr[]; int bnxt_alloc_rx_data(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 6fe9e9b59f83..951c4c569a9b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -442,7 +442,7 @@ static int bnxt_dl_reload_down(struct devlink *dl, bool netns_change, switch (action) { case DEVLINK_RELOAD_ACTION_DRIVER_REINIT: { rtnl_lock(); - if (BNXT_PF(bp) && (bp->pf.active_vfs || bp->sriov_cfg)) { + if (bnxt_sriov_cfg(bp)) { NL_SET_ERR_MSG_MOD(extack, "reload is unsupported while VFs are allocated or being configured"); rtnl_unlock(); From 0a83f96f8709f65a6498a012ba49f608925dfae6 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 16 Nov 2021 17:13:03 +0300 Subject: [PATCH 193/336] MAINTAINERS: remove GR-everest-linux-l2@marvell.com I've sent a patch to GR-everest-linux-l2@marvell.com few days ago and got a reply from postmaster@marvell.com: Delivery has failed to these recipients or groups: gr-everest-linux-l2@marvell.com The email address you entered 
couldn't be found. Please check the recipient's email address and try to resend the message. If the problem continues, please contact your helpdesk. As requested by Alok Prasad, replacing GR-everest-linux-l2@marvell.com with Manish Chopra's email address. [0] Link: https://lore.kernel.org/all/20211116081601.11208-1-palok@marvell.com/ [0] Signed-off-by: Pavel Skripkin Link: https://lore.kernel.org/r/20211116141303.32180-1-paskripkin@gmail.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4c74516e4353..ae8b503ed64d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3733,7 +3733,7 @@ F: drivers/scsi/bnx2i/ BROADCOM BNX2X 10 GIGABIT ETHERNET DRIVER M: Ariel Elior M: Sudarsana Kalluru -M: GR-everest-linux-l2@marvell.com +M: Manish Chopra L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2x/ @@ -15592,7 +15592,7 @@ F: drivers/scsi/qedi/ QLOGIC QL4xxx ETHERNET DRIVER M: Ariel Elior -M: GR-everest-linux-l2@marvell.com +M: Manish Chopra L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/qlogic/qed/ From b0024a04e48837b6556a080ff37ecd8351632596 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 16 Nov 2021 16:09:23 +0000 Subject: [PATCH 194/336] amt: cancel delayed_work synchronously in amt_fini() When the amt module is being removed, it calls cancel_delayed_work() to cancel pending delayed_work. But this function doesn't wait for the delayed work to be canceled, so workers can still be running after the module has been deleted. In order to avoid this, cancel_delayed_work_sync() should be used instead. Suggested-by: Jakub Kicinski Fixes: bc54e49c140b ("amt: add multicast(IGMP) report message handler") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20211116160923.25258-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 47a04c330885..b732ee9a50ef 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3286,7 +3286,7 @@ static void __exit amt_fini(void) { rtnl_link_unregister(&amt_link_ops); unregister_netdevice_notifier(&amt_notifier_block); - cancel_delayed_work(&source_gc_wq); + cancel_delayed_work_sync(&source_gc_wq); __amt_source_gc_work(); destroy_workqueue(amt_wq); } From f799ada6bf2397c351220088b9b0980125c77280 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 12 Nov 2021 11:33:11 -0500 Subject: [PATCH 195/336] net: sched: act_mirred: drop dst for the direction from egress to ingress Without dropping the dst, packets sent from local mirred/redirected to ingress may still use the old dst. ip_rcv() will drop it as the old dst is for output and its .input is dst_discard. This patch fixes this by also dropping the dst for those packets that are mirred or redirected from egress to ingress in act_mirred. Note that we don't drop it for the direction change from ingress to egress, since in that direction there might be a use case where act_tunnel_key attaches a metadata dst that is used later.
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Xin Long Acked-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sched/act_mirred.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index d64b0eeccbe4..efc963ab995a 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -228,6 +229,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, bool want_ingress; bool is_redirect; bool expects_nh; + bool at_ingress; int m_eaction; int mac_len; bool at_nh; @@ -263,7 +265,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, * ingress - that covers the TC S/W datapath. */ is_redirect = tcf_mirred_is_act_redirect(m_eaction); - use_reinsert = skb_at_tc_ingress(skb) && is_redirect && + at_ingress = skb_at_tc_ingress(skb); + use_reinsert = at_ingress && is_redirect && tcf_mirred_can_reinsert(retval); if (!use_reinsert) { skb2 = skb_clone(skb, GFP_ATOMIC); @@ -271,10 +274,12 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb2); - - want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ + skb_dst_drop(skb2); expects_nh = want_ingress || !m_mac_header_xmit; at_nh = skb->data == skb_network_header(skb); From 1d127effdc1750d1f43de42100185430ea0c90bf Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 12 Nov 2021 11:33:12 -0500 Subject: [PATCH 196/336] selftests: add a test case for mirred egress to ingress add a selftest that verifies the correct behavior of TC act_mirred egress to ingress: in particular, it checks if the dst_entry is removed from skb before redirect egress -> ingress. The correct behavior is: an ICMP 'echo request' generated by ping will be received and generate a reply the same way as the one generated by mausezahn. 
Suggested-by: Jamal Hadi Salim Signed-off-by: Davide Caratti Acked-by: Cong Wang Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/forwarding/config | 1 + .../selftests/net/forwarding/tc_actions.sh | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config index a4bd1b087303..697994a9278b 100644 --- a/tools/testing/selftests/net/forwarding/config +++ b/tools/testing/selftests/net/forwarding/config @@ -6,6 +6,7 @@ CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_NET_VRF=m CONFIG_BPF_SYSCALL=y CONFIG_CGROUP_BPF=y +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_MPLS=m CONFIG_NET_ACT_VLAN=m diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index d9eca227136b..de19eb6c38f0 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -3,7 +3,7 @@ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ - gact_trap_test" + gact_trap_test mirred_egress_to_ingress_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -13,10 +13,12 @@ tcflags="skip_hw" h1_create() { simple_if_init $h1 192.0.2.1/24 + tc qdisc add dev $h1 clsact } h1_destroy() { + tc qdisc del dev $h1 clsact simple_if_fini $h1 192.0.2.1/24 } @@ -153,6 +155,49 @@ gact_trap_test() log_test "trap ($tcflags)" } +mirred_egress_to_ingress_test() +{ + RET=0 + + tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \ + ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 8 action \ + ct commit nat src addr 192.0.2.2 pipe \ + ct clear pipe \ + ct commit nat dst addr 192.0.2.1 pipe \ + mirred ingress redirect dev $h1 + + tc filter add dev $swp1 protocol ip pref 11 handle 111 ingress flower \ + ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 8 action drop + tc filter add dev $swp1 protocol ip pref 12 handle 112 ingress flower \ + ip_proto icmp src_ip 192.0.2.1 dst_ip 192.0.2.2 type 0 action pass + + $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ + -t icmp "ping,id=42,seq=10" -q + + tc_check_packets "dev $h1 egress" 100 1 + check_err $? "didn't mirror first packet" + + tc_check_packets "dev $swp1 ingress" 111 1 + check_fail $? "didn't redirect first packet" + tc_check_packets "dev $swp1 ingress" 112 1 + check_err $? "didn't receive reply to first packet" + + ping 192.0.2.2 -I$h1 -c1 -w1 -q 1>/dev/null 2>&1 + + tc_check_packets "dev $h1 egress" 100 2 + check_err $? "didn't mirror second packet" + tc_check_packets "dev $swp1 ingress" 111 1 + check_fail $? "didn't redirect second packet" + tc_check_packets "dev $swp1 ingress" 112 2 + check_err $? 
"didn't receive reply to second packet" + + tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower + tc filter del dev $swp1 ingress protocol ip pref 11 handle 111 flower + tc filter del dev $swp1 ingress protocol ip pref 12 handle 112 flower + + log_test "mirred_egress_to_ingress ($tcflags)" +} + setup_prepare() { h1=${NETIFS[p1]} From 3751c3d34cd5a750c86d1c8eaf217d8faf7f9325 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Nov 2021 16:21:23 +0100 Subject: [PATCH 197/336] net: stmmac: Fix signed/unsigned wreckage The recent addition of timestamp correction to compensate the CDC error introduced a subtle signed/unsigned bug in stmmac_get_tx_hwtstamp() while it managed for some obscure reason to avoid that in stmmac_get_rx_hwtstamp(). The issue is: s64 adjust = 0; u64 ns; adjust += -(2 * (NSEC_PER_SEC / priv->plat->clk_ptp_rate)); ns += adjust; works by chance on 64bit, but falls apart on 32bit because the compiler knows that adjust fits into 32bit and then treats the addition as a u64 + u32 resulting in an off by ~2 seconds failure. The RX variant uses an u64 for adjust and does the adjustment via ns -= adjust; because consistency is obviously overrated. Get rid of the pointless zero initialized adjust variable and do: ns -= (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate; which is obviously correct and spares the adjust obfuscation. Aside of that it yields a more accurate result because the multiplication takes place before the integer divide truncation and not afterwards. Stick the calculation into an inline so it can't be accidentally disimproved. Return an u32 from that inline as the result is guaranteed to fit which lets the compiler optimize the substraction. Cc: stable@vger.kernel.org Fixes: 3600be5f58c1 ("net: stmmac: add timestamp correction to rid CDC sync error") Reported-by: Benedikt Spranger Signed-off-by: Thomas Gleixner Tested-by: Benedikt Spranger Tested-by: Kurt Kanzenbach # Intel EHL Link: https://lore.kernel.org/r/87mtm578cs.ffs@tglx Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d3f350c25b9b..2eb284576336 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -511,6 +511,14 @@ bool stmmac_eee_init(struct stmmac_priv *priv) return true; } +static inline u32 stmmac_cdc_adjust(struct stmmac_priv *priv) +{ + /* Correct the clk domain crossing(CDC) error */ + if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate) + return (2 * NSEC_PER_SEC) / priv->plat->clk_ptp_rate; + return 0; +} + /* stmmac_get_tx_hwtstamp - get HW TX timestamps * @priv: driver private structure * @p : descriptor pointer @@ -524,7 +532,6 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv, { struct skb_shared_hwtstamps shhwtstamp; bool found = false; - s64 adjust = 0; u64 ns = 0; if (!priv->hwts_tx_en) @@ -543,12 +550,7 @@ static void stmmac_get_tx_hwtstamp(struct stmmac_priv *priv, } if (found) { - /* Correct the clk domain crossing(CDC) error */ - if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate) { - adjust += -(2 * (NSEC_PER_SEC / - priv->plat->clk_ptp_rate)); - ns += adjust; - } + ns -= stmmac_cdc_adjust(priv); memset(&shhwtstamp, 0, sizeof(struct skb_shared_hwtstamps)); shhwtstamp.hwtstamp = ns_to_ktime(ns); @@ -573,7 +575,6 @@ static void stmmac_get_rx_hwtstamp(struct 
stmmac_priv *priv, struct dma_desc *p, { struct skb_shared_hwtstamps *shhwtstamp = NULL; struct dma_desc *desc = p; - u64 adjust = 0; u64 ns = 0; if (!priv->hwts_rx_en) @@ -586,11 +587,7 @@ static void stmmac_get_rx_hwtstamp(struct stmmac_priv *priv, struct dma_desc *p, if (stmmac_get_rx_timestamp_status(priv, p, np, priv->adv_ts)) { stmmac_get_timestamp(priv, desc, priv->adv_ts, &ns); - /* Correct the clk domain crossing(CDC) error */ - if (priv->plat->has_gmac4 && priv->plat->clk_ptp_rate) { - adjust += 2 * (NSEC_PER_SEC / priv->plat->clk_ptp_rate); - ns -= adjust; - } + ns -= stmmac_cdc_adjust(priv); netdev_dbg(priv->dev, "get valid RX hw timestamp %llu\n", ns); shhwtstamp = skb_hwtstamps(skb); From 963d0b3569354230f6e2c36a286ef270a8901878 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 16 Nov 2021 07:55:45 -0800 Subject: [PATCH 198/336] drm/scheduler: fix drm_sched_job_add_implicit_dependencies harder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_job_add_dependency() could drop the last ref, so we need to do the dma_fence_get() first. Cc: Christian König Fixes: 9c2ba265352a ("drm/scheduler: use new iterator in drm_sched_job_add_implicit_dependencies v2") Signed-off-by: Rob Clark Link: https://patchwork.freedesktop.org/patch/msgid/20211116155545.473311-1-robdclark@gmail.com Reviewed-by: Daniel Vetter Reviewed-by: Christian König Tested-by: Amit Pundir Signed-off-by: Christian König --- drivers/gpu/drm/scheduler/sched_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 94fe51b3caa2..f91fb31ab7a7 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -704,12 +704,13 @@ int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job, int ret; dma_resv_for_each_fence(&cursor, obj->resv, write, fence) { - ret = drm_sched_job_add_dependency(job, fence); - if (ret) - return ret; - /* Make sure to grab an additional ref on the added fence */ dma_fence_get(fence); + ret = drm_sched_job_add_dependency(job, fence); + if (ret) { + dma_fence_put(fence); + return ret; + } } return 0; } From 968219708108440b23bc292e0486e3cc1d9a1bed Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 9 Nov 2021 15:57:12 +0100 Subject: [PATCH 199/336] fs: handle circular mappings correctly When calling setattr_prepare() to determine the validity of the attributes the ia_{g,u}id fields contain the value that will be written to inode->i_{g,u}id. When the {g,u}id attribute of the file isn't altered and the caller's fs{g,u}id matches the current {g,u}id attribute the attribute change is allowed. The value in ia_{g,u}id does already account for idmapped mounts and will have taken the relevant idmapping into account. So in order to verify that the {g,u}id attribute isn't changed we simple need to compare the ia_{g,u}id value against the inode's i_{g,u}id value. This only has any meaning for idmapped mounts as idmapping helpers are idempotent without them. And for idmapped mounts this really only has a meaning when circular idmappings are used, i.e. mappings where e.g. id 1000 is mapped to id 1001 and id 1001 is mapped to id 1000. Such ciruclar mappings can e.g. be useful when sharing the same home directory between multiple users at the same time. As an example consider a directory with two files: /source/file1 owned by {g,u}id 1000 and /source/file2 owned by {g,u}id 1001. 
Assume we create an idmapped mount at /target with an idmapping that maps files owned by {g,u}id 1000 to being owned by {g,u}id 1001 and files owned by {g,u}id 1001 to being owned by {g,u}id 1000. In effect, the idmapped mount at /target switches the ownership of /source/file1 and source/file2, i.e. /target/file1 will be owned by {g,u}id 1001 and /target/file2 will be owned by {g,u}id 1000. This means that a user with fs{g,u}id 1000 must be allowed to setattr /target/file2 from {g,u}id 1000 to {g,u}id 1000. Similar, a user with fs{g,u}id 1001 must be allowed to setattr /target/file1 from {g,u}id 1001 to {g,u}id 1001. Conversely, a user with fs{g,u}id 1000 must fail to setattr /target/file1 from {g,u}id 1001 to {g,u}id 1000. And a user with fs{g,u}id 1001 must fail to setattr /target/file2 from {g,u}id 1000 to {g,u}id 1000. Both cases must fail with EPERM for non-capable callers. Before this patch we could end up denying legitimate attribute changes and allowing invalid attribute changes when circular mappings are used. To even get into this situation the caller must've been privileged both to create that mapping and to create that idmapped mount. This hasn't been seen in the wild anywhere but came up when expanding the testsuite during work on a series of hardening patches. All idmapped fstests pass without any regressions and we add new tests to verify the behavior of circular mappings. Link: https://lore.kernel.org/r/20211109145713.1868404-1-brauner@kernel.org Fixes: 2f221d6f7b88 ("attr: handle idmapped mounts") Cc: Seth Forshee Cc: Christoph Hellwig Cc: Al Viro Cc: stable@vger.kernel.org CC: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig Acked-by: Seth Forshee Signed-off-by: Christian Brauner --- fs/attr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 473d21b3a86d..66899b6e9bd8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -35,7 +35,7 @@ static bool chown_ok(struct user_namespace *mnt_userns, kuid_t uid) { kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid)) + if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid)) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; @@ -62,7 +62,7 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, { kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, kgid))) + (in_group_p(gid) || gid_eq(gid, inode->i_gid))) return true; if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; From fb561bf9abde49f7e00fdbf9ed2ccf2d86cac8ee Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Thu, 11 Nov 2021 12:57:57 +0100 Subject: [PATCH 200/336] fbdev: Prevent probing generic drivers if a FB is already registered The efifb and simplefb drivers just render to a pre-allocated frame buffer and rely on the display hardware being initialized before the kernel boots. But if another driver already probed correctly and registered a fbdev, the generic drivers shouldn't be probed since an actual driver for the display hardware is already present. This is more likely to occur after commit d391c5827107 ("drivers/firmware: move x86 Generic System Framebuffers support") since the "efi-framebuffer" and "simple-framebuffer" platform devices are registered at a later time. 
Link: https://lore.kernel.org/r/20211110200253.rfudkt3edbd3nsyj@lahvuun/ Fixes: d391c5827107 ("drivers/firmware: move x86 Generic System Framebuffers support") Reported-by: Ilya Trukhanov Cc: # 5.15.x Signed-off-by: Javier Martinez Canillas Reviewed-by: Daniel Vetter Tested-by: Ilya Trukhanov Link: https://patchwork.freedesktop.org/patch/msgid/20211111115757.1351045-1-javierm@redhat.com --- drivers/video/fbdev/efifb.c | 11 +++++++++++ drivers/video/fbdev/simplefb.c | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c index edca3703b964..ea42ba6445b2 100644 --- a/drivers/video/fbdev/efifb.c +++ b/drivers/video/fbdev/efifb.c @@ -351,6 +351,17 @@ static int efifb_probe(struct platform_device *dev) char *option = NULL; efi_memory_desc_t md; + /* + * Generic drivers must not be registered if a framebuffer exists. + * If a native driver was probed, the display hardware was already + * taken and attempting to use the system framebuffer is dangerous. + */ + if (num_registered_fb > 0) { + dev_err(&dev->dev, + "efifb: a framebuffer is already registered\n"); + return -EINVAL; + } + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || pci_dev_disabled) return -ENODEV; diff --git a/drivers/video/fbdev/simplefb.c b/drivers/video/fbdev/simplefb.c index 62f0ded70681..b63074fd892e 100644 --- a/drivers/video/fbdev/simplefb.c +++ b/drivers/video/fbdev/simplefb.c @@ -407,6 +407,17 @@ static int simplefb_probe(struct platform_device *pdev) struct simplefb_par *par; struct resource *mem; + /* + * Generic drivers must not be registered if a framebuffer exists. + * If a native driver was probed, the display hardware was already + * taken and attempting to use the system framebuffer is dangerous. + */ + if (num_registered_fb > 0) { + dev_err(&pdev->dev, + "simplefb: a framebuffer is already registered\n"); + return -EINVAL; + } + if (fb_get_options("simplefb", NULL)) return -ENODEV; From bec05f33ebc1006899c6d3e59a00c58881fe7626 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Sun, 14 Nov 2021 17:08:17 +0100 Subject: [PATCH 201/336] parisc/sticon: fix reverse colors sticon_build_attr() checked the reverse argument and flipped background and foreground color, but returned the non-reverse value afterwards. Fix this and also add two local variables for foreground and background color to make the code easier to read. 
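For illustration (values chosen arbitrarily, derived only from the code in the diff below): with color 0x17 the new helper computes fg = 7 and bg = 1, so the non-reverse attribute is (bg << 3) | fg = 0x0f and the reverse attribute is (fg << 3) | bg = 0x39, i.e. foreground and background really are swapped. The old code computed the swapped value but then returned the untouched attr, so reverse video had no effect.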
Signed-off-by: Sven Schnelle Cc: Signed-off-by: Helge Deller --- drivers/video/console/sticon.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/video/console/sticon.c b/drivers/video/console/sticon.c index 1b451165311c..40496e9e9b43 100644 --- a/drivers/video/console/sticon.c +++ b/drivers/video/console/sticon.c @@ -332,13 +332,13 @@ static u8 sticon_build_attr(struct vc_data *conp, u8 color, bool blink, bool underline, bool reverse, bool italic) { - u8 attr = ((color & 0x70) >> 1) | ((color & 7)); + u8 fg = color & 7; + u8 bg = (color & 0x70) >> 4; - if (reverse) { - color = ((color >> 3) & 0x7) | ((color & 0x7) << 3); - } - - return attr; + if (reverse) + return (fg << 3) | bg; + else + return (bg << 3) | fg; } static void sticon_invert_region(struct vc_data *conp, u16 *p, int count) From 8e80a73fa9a7747e3e8255cb149c543aabf65a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 16 Nov 2021 14:40:22 +0100 Subject: [PATCH 202/336] powerpc/xive: Change IRQ domain to a tree domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4f86a06e2d6e ("irqdomain: Make normal and nomap irqdomains exclusive") introduced an IRQ_DOMAIN_FLAG_NO_MAP flag to isolate the 'nomap' domains still in use under the powerpc arch. With this new flag, the revmap_tree of the IRQ domain is not used anymore. This change broke the support of shared LSIs [1] in the XIVE driver because it was relying on a lookup in the revmap_tree to query previously mapped interrupts. Linux now creates two distinct IRQ mappings on the same HW IRQ which can lead to unexpected behavior in the drivers. The XIVE IRQ domain is not a direct mapping domain and its HW IRQ interrupt number space is rather large : 1M/socket on POWER9 and POWER10, change the XIVE driver to use a 'tree' domain type instead. [1] For instance, a linux KVM guest with virtio-rng and virtio-balloon devices. 
Fixes: 4f86a06e2d6e ("irqdomain: Make normal and nomap irqdomains exclusive") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Cédric Le Goater Tested-by: Greg Kurz Acked-by: Marc Zyngier Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211116134022.420412-1-clg@kaod.org --- arch/powerpc/sysdev/xive/Kconfig | 1 - arch/powerpc/sysdev/xive/common.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig index 97796c6b63f0..785c292d104b 100644 --- a/arch/powerpc/sysdev/xive/Kconfig +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -3,7 +3,6 @@ config PPC_XIVE bool select PPC_SMP_MUXED_IPI select HARDIRQS_SW_RESEND - select IRQ_DOMAIN_NOMAP config PPC_XIVE_NATIVE bool diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index c5d75c02ad8b..7b69299c2912 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1443,8 +1443,7 @@ static const struct irq_domain_ops xive_irq_domain_ops = { static void __init xive_init_host(struct device_node *np) { - xive_irq_domain = irq_domain_add_nomap(np, XIVE_MAX_IRQ, - &xive_irq_domain_ops, NULL); + xive_irq_domain = irq_domain_add_tree(np, &xive_irq_domain_ops, NULL); if (WARN_ON(xive_irq_domain == NULL)) return; irq_set_default_host(xive_irq_domain); From 1c1c3c7d08d8f51c8317119ccba8c93effc02c2b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Nov 2021 12:11:20 +0000 Subject: [PATCH 203/336] libbpf: update index.rst reference Changeset d20b41115ad5 ("libbpf: Rename libbpf documentation index file") renamed: Documentation/bpf/libbpf/libbpf.rst to: Documentation/bpf/libbpf/index.rst. Update its cross-reference accordingly. Fixes: d20b41115ad5 ("libbpf: Rename libbpf documentation index file") Acked-by: Andrii Nakryiko Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/bpf/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 37f273a7e8b6..610450f59e05 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -15,7 +15,7 @@ that goes into great technical depth about the BPF Architecture. libbpf ====== -Documentation/bpf/libbpf/libbpf.rst is a userspace library for loading and interacting with bpf programs. +Documentation/bpf/libbpf/index.rst is a userspace library for loading and interacting with bpf programs. BPF Type Format (BTF) ===================== From 0f60a29c52b515532e6b11dc6b3c9e5b5f7ff2b4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Nov 2021 12:11:21 +0000 Subject: [PATCH 204/336] docs: accounting: update delay-accounting.rst reference The file name: accounting/delay-accounting.rst should be, instead: Documentation/accounting/delay-accounting.rst. Also, there's no need to use doc:`foo`, as automarkup.py will automatically handle plain text mentions to Documentation/ files. So, update its cross-reference accordingly. 
Fixes: fcb501704554 ("delayacct: Document task_delayacct sysctl") Fixes: c3123552aad3 ("docs: accounting: convert to ReST") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/sysctl/kernel.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 426162009ce9..0e486f41185e 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1099,7 +1099,7 @@ task_delayacct =============== Enables/disables task delay accounting (see -:doc:`accounting/delay-accounting.rst`). Enabling this feature incurs +Documentation/accounting/delay-accounting.rst. Enabling this feature incurs a small amount of overhead in the scheduler but is useful for debugging and performance tuning. It is required by some tools such as iotop. From 636e36b19d3fac05a21f6a1d2236a2e0b04fe52d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Nov 2021 12:11:22 +0000 Subject: [PATCH 205/336] Documentation: update vcpu-requests.rst reference Changeset 2f5947dfcaec ("Documentation: move Documentation/virtual to Documentation/virt") renamed: Documentation/virtual/kvm/vcpu-requests.rst to: Documentation/virt/kvm/vcpu-requests.rst. Update its cross-reference accordingly. Fixes: 2f5947dfcaec ("Documentation: move Documentation/virtual to Documentation/virt") Reviewed-by: Anup Patel Signed-off-by: Mauro Carvalho Chehab Acked-by: Paolo Bonzini Signed-off-by: Jonathan Corbet --- arch/riscv/kvm/vcpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index e3d3aed46184..fb84619df012 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -740,7 +740,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virtual/kvm/vcpu-requests.rst + * Documentation/virt/kvm/vcpu-requests.rst */ vcpu->mode = IN_GUEST_MODE; From b96ff02ab2be1791248237b1bf318aaf62e8b701 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 16 Nov 2021 12:11:23 +0000 Subject: [PATCH 206/336] Documentation/process: fix a cross reference The cross-reference for the handbooks section works. However, it is meant to describe the path inside the Kernel's doc where the section is, but there's an space instead of a dash, plus it lacks the .rst at the end, which makes: ./scripts/documentation-file-ref-check to complain. Fixes: 604370e106cc ("Documentation/process: Add maintainer handbooks section") Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jonathan Corbet --- Documentation/process/submitting-patches.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst index a0cc96923ea7..da085d63af9b 100644 --- a/Documentation/process/submitting-patches.rst +++ b/Documentation/process/submitting-patches.rst @@ -22,8 +22,8 @@ use it, it will make your life as a kernel developer and in general much easier. Some subsystems and maintainer trees have additional information about -their workflow and expectations, see :ref:`Documentation/process/maintainer -handbooks `. +their workflow and expectations, see +:ref:`Documentation/process/maintainer-handbooks.rst `. 
Obtain a current source tree ---------------------------- From f15863b27752682bb700c21de5f83f613a0fb77e Mon Sep 17 00:00:00 2001 From: Vandita Kulkarni Date: Tue, 9 Nov 2021 17:34:28 +0530 Subject: [PATCH 207/336] Revert "drm/i915/tgl/dsi: Gate the ddi clocks after pll mapping" This reverts commit 991d9557b0c4 ("drm/i915/tgl/dsi: Gate the ddi clocks after pll mapping"). The Bspec was updated recently with the pll ungate sequence similar to that of icl dsi enable sequence. Hence reverting. Bspec: 49187 Fixes: 991d9557b0c4 ("drm/i915/tgl/dsi: Gate the ddi clocks after pll mapping") Cc: # v5.4+ Signed-off-by: Vandita Kulkarni Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20211109120428.15211-1-vandita.kulkarni@intel.com (cherry picked from commit 4579509ef181480f4e4510d436c691519167c5c2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/icl_dsi.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 168c84a74d30..00f270c41894 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -696,10 +696,7 @@ static void gen11_dsi_map_pll(struct intel_encoder *encoder, intel_de_write(dev_priv, ICL_DPCLKA_CFGCR0, val); for_each_dsi_phy(phy, intel_dsi->phys) { - if (DISPLAY_VER(dev_priv) >= 12) - val |= ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); - else - val &= ~ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); + val &= ~ICL_DPCLKA_CFGCR0_DDI_CLK_OFF(phy); } intel_de_write(dev_priv, ICL_DPCLKA_CFGCR0, val); @@ -1135,8 +1132,6 @@ static void gen11_dsi_enable_port_and_phy(struct intel_encoder *encoder, const struct intel_crtc_state *crtc_state) { - struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); - /* step 4a: power up all lanes of the DDI used by DSI */ gen11_dsi_power_up_lanes(encoder); @@ -1162,8 +1157,7 @@ gen11_dsi_enable_port_and_phy(struct intel_encoder *encoder, gen11_dsi_configure_transcoder(encoder, crtc_state); /* Step 4l: Gate DDI clocks */ - if (DISPLAY_VER(dev_priv) == 11) - gen11_dsi_gate_clocks(encoder); + gen11_dsi_gate_clocks(encoder); } static void gen11_dsi_powerup_panel(struct intel_encoder *encoder) From d33233d8782ede666b54f655522064d000767f74 Mon Sep 17 00:00:00 2001 From: Vandita Kulkarni Date: Tue, 19 Oct 2021 20:44:32 +0530 Subject: [PATCH 208/336] drm/i915/dsi/xelpd: Fix the bit mask for wakeup GB v2: Fix the typo, move out the hardcoding from macro(Jani, Ville) Fixes: f87c46c43175 ("drm/i915/dsi/xelpd: Add WA to program LP to HS wakeup guardband") Signed-off-by: Vandita Kulkarni Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20211019151435.20477-2-vandita.kulkarni@intel.com (cherry picked from commit 6f07707fa09e1dc58c431d57c25ef2e68b9bec47) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/display/icl_dsi.c | 3 ++- drivers/gpu/drm/i915/i915_reg.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index 00f270c41894..71fbdcddd31f 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -1265,7 +1265,8 @@ static void adlp_set_lp_hs_wakeup_gb(struct intel_encoder *encoder) if (DISPLAY_VER(i915) == 13) { for_each_dsi_port(port, intel_dsi->ports) intel_de_rmw(i915, TGL_DSI_CHKN_REG(port), - TGL_DSI_CHKN_LSHS_GB, 0x4); + TGL_DSI_CHKN_LSHS_GB_MASK, + TGL_DSI_CHKN_LSHS_GB(4)); } } diff --git a/drivers/gpu/drm/i915/i915_reg.h 
b/drivers/gpu/drm/i915/i915_reg.h index da9055c3ebf0..bcee121bec5a 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -11717,7 +11717,9 @@ enum skl_power_gate { #define TGL_DSI_CHKN_REG(port) _MMIO_PORT(port, \ _TGL_DSI_CHKN_REG_0, \ _TGL_DSI_CHKN_REG_1) -#define TGL_DSI_CHKN_LSHS_GB REG_GENMASK(15, 12) +#define TGL_DSI_CHKN_LSHS_GB_MASK REG_GENMASK(15, 12) +#define TGL_DSI_CHKN_LSHS_GB(byte_clocks) REG_FIELD_PREP(TGL_DSI_CHKN_LSHS_GB_MASK, \ + (byte_clocks)) /* Display Stream Splitter Control */ #define DSS_CTL1 _MMIO(0x67400) From 8b2abf777d8ea8d8db15af553454e0e976804225 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 16 Nov 2021 14:49:16 +0300 Subject: [PATCH 209/336] drm/i915/guc: fix NULL vs IS_ERR() checking The intel_engine_create_virtual() function does not return NULL. It returns error pointers. Fixes: e5e32171a2cf ("drm/i915/guc: Connect UAPI to GuC multi-lrc interface") Signed-off-by: Dan Carpenter Reviewed-by: Rodrigo Vivi Signed-off-by: John Harrison Link: https://patchwork.freedesktop.org/patch/msgid/20211116114916.GB11936@kili (cherry picked from commit fc12b70d12d07598cde27cc17dbfafc2a2a33ff8) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 38b47e73e35d..c48557dfa04c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -3080,8 +3080,8 @@ guc_create_parallel(struct intel_engine_cs **engines, ce = intel_engine_create_virtual(siblings, num_siblings, FORCE_VIRTUAL); - if (!ce) { - err = ERR_PTR(-ENOMEM); + if (IS_ERR(ce)) { + err = ERR_CAST(ce); goto unwind; } From e324234e0aa881b7841c7c713306403e12b069ff Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 15 Nov 2021 12:03:32 +0300 Subject: [PATCH 210/336] perf/x86/intel/uncore: Fix filter_tid mask for CHA events on Skylake Server According Uncore Reference Manual: any of the CHA events may be filtered by Thread/Core-ID by using tid modifier in CHA Filter 0 Register. Update skx_cha_hw_config() to follow Uncore Guide. 
Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20211115090334.3789-2-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index eb2c6cea9d0d..e5ee6bb62ef5 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3608,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID; for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask)) From 3866ae319c846a612109c008f43cba80b8c15e86 Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 15 Nov 2021 12:03:33 +0300 Subject: [PATCH 211/336] perf/x86/intel/uncore: Fix IIO event constraints for Skylake Server According to the latest uncore document, COMP_BUF_OCCUPANCY (0xd5) event can be collected on 2-3 counters. Update uncore IIO event constraints for Skylake Server. Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20211115090334.3789-3-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e5ee6bb62ef5..9aba4ef77b13 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3678,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; From bdc0feee05174418dec1fa68de2af19e1750b99f Mon Sep 17 00:00:00 2001 From: Alexander Antonov Date: Mon, 15 Nov 2021 12:03:34 +0300 Subject: [PATCH 212/336] perf/x86/intel/uncore: Fix IIO event constraints for Snowridge According to the latest uncore document, DATA_REQ_OF_CPU (0x83), DATA_REQ_BY_CPU (0xc0) and COMP_BUF_OCCUPANCY (0xd5) events have constraints. Add uncore IIO constraints for Snowridge. 
Fixes: 210cc5f9db7a ("perf/x86/intel/uncore: Add uncore support for Snow Ridge server") Signed-off-by: Alexander Antonov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20211115090334.3789-4-alexander.antonov@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 9aba4ef77b13..3660f698fb2a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4529,6 +4529,13 @@ static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); } +static struct event_constraint snr_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type snr_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4540,6 +4547,7 @@ static struct intel_uncore_type snr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, .msr_offset = SNR_IIO_MSR_OFFSET, + .constraints = snr_uncore_iio_constraints, .ops = &ivbep_uncore_msr_ops, .format_group = &snr_uncore_iio_format_group, .attr_update = snr_iio_attr_update, From f3fd84a3b7754b60df67ebfe64e1d90623895111 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 11 Nov 2021 21:45:10 -0800 Subject: [PATCH 213/336] x86/perf: Fix snapshot_branch_stack warning in VM When running in VM intel_pmu_snapshot_branch_stack triggers WRMSR warning like: [ ] unchecked MSR access error: WRMSR to 0x3f1 (tried to write 0x0000000000000000) at rIP: 0xffffffff81011a5b (intel_pmu_snapshot_branch_stack+0x3b/0xd0) This can be triggered with BPF selftests: tools/testing/selftests/bpf/test_progs -t get_branch_snapshot This warning is caused by __intel_pmu_pebs_disable_all() in the VM. Since it is not necessary to disable PEBS for LBR, remove it from intel_pmu_snapshot_branch_stack and intel_pmu_snapshot_arch_branch_stack. Fixes: c22ac2a3d4bd ("perf: Enable branch record for software events") Signed-off-by: Song Liu Signed-off-by: Peter Zijlstra (Intel) Tested-by: Like Xu Link: https://lore.kernel.org/r/20211112054510.2667030-1-songliubraving@fb.com --- arch/x86/events/intel/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 42cf01ecdd13..ec6444f2c9dc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2211,7 +2211,6 @@ intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int /* must not have branches... */ local_irq_save(flags); __intel_pmu_disable_all(false); /* we don't care about BTS */ - __intel_pmu_pebs_disable_all(); __intel_pmu_lbr_disable(); /* ... until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); @@ -2225,7 +2224,6 @@ intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned /* must not have branches... */ local_irq_save(flags); __intel_pmu_disable_all(false); /* we don't care about BTS */ - __intel_pmu_pebs_disable_all(); __intel_pmu_arch_lbr_disable(); /* ... 
until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); From 245a489e81e13dd55ae46d27becf6d5901eb7828 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 17 Nov 2021 19:55:02 +0800 Subject: [PATCH 214/336] block: avoid to quiesce queue in elevator_init_mq elevator_init_mq() is only called before adding disk, when there isn't any FS I/O, only passthrough requests can be queued, so freezing queue plus canceling dispatch work is enough to drain any dispatch activities, then we can avoid synchronize_srcu() in blk_mq_quiesce_queue(). Long boot latency issue can be fixed in case of lots of disks added during booting. Fixes: 737eb78e82d5 ("block: Delay default elevator initialization") Reported-by: yangerkun Cc: Damien Le Moal Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211117115502.1600950-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/elevator.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 1f39f6e8ebb9..19a78d5516ba 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -694,12 +694,18 @@ void elevator_init_mq(struct request_queue *q) if (!e) return; + /* + * We are called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + */ blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (err) { From f77b83b5bbab53d2be339184838b19ed2c62c0a5 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Tue, 16 Nov 2021 22:19:17 +0800 Subject: [PATCH 215/336] net: usb: r8152: Add MAC passthrough support for more Lenovo Docks Like ThinkaPad Thunderbolt 4 Dock, more Lenovo docks start to use the original Realtek USB ethernet chip ID 0bda:8153. Lenovo Docks always use their own IDs for usb hub, even for older Docks. If parent hub is from Lenovo, then r8152 should try MAC passthrough. Verified on Lenovo TBT3 dock too. Signed-off-by: Aaron Ma Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 4a02f33f0643..f9877a3e83ac 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -9603,12 +9603,9 @@ static int rtl8152_probe(struct usb_interface *intf, netdev->hw_features &= ~NETIF_F_RXCSUM; } - if (le16_to_cpu(udev->descriptor.idVendor) == VENDOR_ID_LENOVO) { - switch (le16_to_cpu(udev->descriptor.idProduct)) { - case DEVICE_ID_THINKPAD_THUNDERBOLT3_DOCK_GEN2: - case DEVICE_ID_THINKPAD_USB_C_DOCK_GEN2: - tp->lenovo_macpassthru = 1; - } + if (udev->parent && + le16_to_cpu(udev->parent->descriptor.idVendor) == VENDOR_ID_LENOVO) { + tp->lenovo_macpassthru = 1; } if (le16_to_cpu(udev->descriptor.bcdDevice) == 0x3011 && udev->serial && From 9b5a333272a48c2f8b30add7a874e46e8b26129c Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 16 Nov 2021 18:17:12 +0300 Subject: [PATCH 216/336] net: dpaa2-eth: fix use-after-free in dpaa2_eth_remove Access to netdev after free_netdev() will cause use-after-free bug. Move debug log before free_netdev() call to avoid it. Fixes: 7472dd9f6499 ("staging: fsl-dpaa2/eth: Move print message") Signed-off-by: Pavel Skripkin Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 714e961e7a77..6451c8383639 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4550,10 +4550,10 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev) fsl_mc_portal_free(priv->mc_io); - free_netdev(net_dev); - dev_dbg(net_dev->dev.parent, "Removed interface %s\n", net_dev->name); + free_netdev(net_dev); + return 0; } From cf9acc90c80ecbee00334aa85d92f4e74014bcff Mon Sep 17 00:00:00 2001 From: Jonathan Davies Date: Tue, 16 Nov 2021 17:42:42 +0000 Subject: [PATCH 217/336] net: virtio_net_hdr_to_skb: count transport header in UFO virtio_net_hdr_to_skb does not set the skb's gso_size and gso_type correctly for UFO packets received via virtio-net that are a little over the GSO size. This can lead to problems elsewhere in the networking stack, e.g. ovs_vport_send dropping over-sized packets if gso_size is not set. This is due to the comparison if (skb->len - p_off > gso_size) not properly accounting for the transport layer header. p_off includes the size of the transport layer header (thlen), so skb->len - p_off is the size of the TCP/UDP payload. gso_size is read from the virtio-net header. For UFO, fragmentation happens at the IP level so does not need to include the UDP header. Hence the calculation could be comparing a TCP/UDP payload length with an IP payload length, causing legitimate virtio-net packets to have lack gso_type/gso_size information. Example: a UDP packet with payload size 1473 has IP payload size 1481. If the guest used UFO, it is not fragmented and the virtio-net header's flags indicate that it is a GSO frame (VIRTIO_NET_HDR_GSO_UDP), with gso_size = 1480 for an MTU of 1500. skb->len will be 1515 and p_off will be 42, so skb->len - p_off = 1473. Hence the comparison fails, and shinfo->gso_size and gso_type are not set as they should be. Instead, add the UDP header length before comparing to gso_size when using UFO. In this way, it is the size of the IP payload that is compared to gso_size. Fixes: 6dd912f82680 ("net: check untrusted gso_size at kernel entry") Signed-off-by: Jonathan Davies Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index b465f8f3e554..04e87f4b9417 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -120,10 +120,15 @@ retry: if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); + unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); + /* UFO may not include transport header in gso_size. */ + if (gso_type & SKB_GSO_UDP) + nh_off -= thlen; + /* Too small packets are not really GSO ones. 
*/ - if (skb->len - p_off > gso_size) { + if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; shinfo->gso_type = gso_type; From c366ce28750e9633f8d4b07829a9cde0e59034eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Stelmach?= Date: Tue, 16 Nov 2021 22:29:15 +0100 Subject: [PATCH 218/336] net: ax88796c: use bit numbers insetad of bit masks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change the values of EVENT_* constants from bit masks to bit numbers as accepted by {clear,set,test}_bit() functions. Reported-by: Dan Carpenter Signed-off-by: Łukasz Stelmach Signed-off-by: David S. Miller --- drivers/net/ethernet/asix/ax88796c_main.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/asix/ax88796c_main.h b/drivers/net/ethernet/asix/ax88796c_main.h index 80263c3cef75..4a83c991dcbe 100644 --- a/drivers/net/ethernet/asix/ax88796c_main.h +++ b/drivers/net/ethernet/asix/ax88796c_main.h @@ -127,9 +127,9 @@ struct ax88796c_device { #define AX_PRIV_FLAGS_MASK (AX_CAP_COMP) unsigned long flags; - #define EVENT_INTR BIT(0) - #define EVENT_TX BIT(1) - #define EVENT_SET_MULTI BIT(2) + #define EVENT_INTR 0 + #define EVENT_TX 1 + #define EVENT_SET_MULTI 2 }; From 6afbd7b3c53cb7417189f476e99d431daccb85b0 Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Thu, 21 Jan 2021 16:17:22 +0000 Subject: [PATCH 219/336] i40e: Fix correct max_pkt_size on VF RX queue Setting VLAN port increasing RX queue max_pkt_size by 4 bytes to take VLAN tag into account. Trigger the VF reset when setting port VLAN for VF to renegotiate its capabilities and reinitialize. Fixes: ba4e003d29c1 ("i40e: don't hold spinlock while resetting VF") Signed-off-by: Sylwester Dziedziuch Signed-off-by: Aleksandr Loktionov Signed-off-by: Eryk Rybak Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 53 ++++--------------- 1 file changed, 9 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 472f56b360b8..815661632e7a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -674,14 +674,13 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, u16 vsi_queue_id, struct virtchnl_rxq_info *info) { + u16 pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); struct i40e_pf *pf = vf->pf; + struct i40e_vsi *vsi = pf->vsi[vf->lan_vsi_idx]; struct i40e_hw *hw = &pf->hw; struct i40e_hmc_obj_rxq rx_ctx; - u16 pf_queue_id; int ret = 0; - pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); - /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq)); @@ -719,6 +718,10 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, } rx_ctx.rxmax = info->max_pkt_size; + /* if port VLAN is configured increase the max packet size */ + if (vsi->info.pvid) + rx_ctx.rxmax += VLAN_HLEN; + /* enable 32bytes desc always */ rx_ctx.dsize = 1; @@ -4169,34 +4172,6 @@ error_param: return ret; } -/** - * i40e_vsi_has_vlans - True if VSI has configured VLANs - * @vsi: pointer to the vsi - * - * Check if a VSI has configured any VLANs. False if we have a port VLAN or if - * we have no configured VLANs. Do not call while holding the - * mac_filter_hash_lock. 
- */ -static bool i40e_vsi_has_vlans(struct i40e_vsi *vsi) -{ - bool have_vlans; - - /* If we have a port VLAN, then the VSI cannot have any VLANs - * configured, as all MAC/VLAN filters will be assigned to the PVID. - */ - if (vsi->info.pvid) - return false; - - /* Since we don't have a PVID, we know that if the device is in VLAN - * mode it must be because of a VLAN filter configured on this VSI. - */ - spin_lock_bh(&vsi->mac_filter_hash_lock); - have_vlans = i40e_is_vsi_in_vlan(vsi); - spin_unlock_bh(&vsi->mac_filter_hash_lock); - - return have_vlans; -} - /** * i40e_ndo_set_vf_port_vlan * @netdev: network interface device structure @@ -4253,19 +4228,9 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, /* duplicate request, so just return success */ goto error_pvid; - if (i40e_vsi_has_vlans(vsi)) { - dev_err(&pf->pdev->dev, - "VF %d has already configured VLAN filters and the administrator is requesting a port VLAN override.\nPlease unload and reload the VF driver for this change to take effect.\n", - vf_id); - /* Administrator Error - knock the VF offline until he does - * the right thing by reconfiguring his network correctly - * and then reloading the VF driver. - */ - i40e_vc_disable_vf(vf); - /* During reset the VF got a new VSI, so refresh the pointer. */ - vsi = pf->vsi[vf->lan_vsi_idx]; - } - + i40e_vc_disable_vf(vf); + /* During reset the VF got a new VSI, so refresh a pointer. */ + vsi = pf->vsi[vf->lan_vsi_idx]; /* Locked once because multiple functions below iterate list */ spin_lock_bh(&vsi->mac_filter_hash_lock); From 37d9e304acd903a445df8208b8a13d707902dea6 Mon Sep 17 00:00:00 2001 From: Michal Maloszewski Date: Wed, 24 Feb 2021 12:07:48 +0000 Subject: [PATCH 220/336] i40e: Fix NULL ptr dereference on VSI filter sync Remove the reason of null pointer dereference in sync VSI filters. Added new I40E_VSI_RELEASING flag to signalize deleting and releasing of VSI resources to sync this thread with sync filters subtask. Without this patch it is possible to start update the VSI filter list after VSI is removed, that's causing a kernel oops. 
Fixes: 41c445ff0f48 ("i40e: main driver core") Signed-off-by: Grzegorz Szczurek Signed-off-by: Michal Maloszewski Reviewed-by: Przemyslaw Patynowski Reviewed-by: Witold Fijalkowski Reviewed-by: Jaroslaw Gawin Reviewed-by: Aleksandr Loktionov Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 3d528fba754b..35a83a161b6f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -161,6 +161,7 @@ enum i40e_vsi_state_t { __I40E_VSI_OVERFLOW_PROMISC, __I40E_VSI_REINIT_REQUESTED, __I40E_VSI_DOWN_REQUESTED, + __I40E_VSI_RELEASING, /* This must be last as it determines the size of the BITMAP */ __I40E_VSI_STATE_SIZE__, }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index ba862131b9bd..6e309d6ce37d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2623,7 +2623,8 @@ static void i40e_sync_filters_subtask(struct i40e_pf *pf) for (v = 0; v < pf->num_alloc_vsi; v++) { if (pf->vsi[v] && - (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED)) { + (pf->vsi[v]->flags & I40E_VSI_FLAG_FILTER_CHANGED) && + !test_bit(__I40E_VSI_RELEASING, pf->vsi[v]->state)) { int ret = i40e_sync_vsi_filters(pf->vsi[v]); if (ret) { @@ -13771,7 +13772,7 @@ int i40e_vsi_release(struct i40e_vsi *vsi) dev_info(&pf->pdev->dev, "Can't remove PF VSI\n"); return -ENODEV; } - + set_bit(__I40E_VSI_RELEASING, vsi->state); uplink_seid = vsi->uplink_seid; if (vsi->type != I40E_VSI_SRIOV) { if (vsi->netdev_registered) { From d2a69fefd75683004ffe87166de5635b3267ee07 Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Fri, 23 Apr 2021 13:43:25 +0200 Subject: [PATCH 221/336] i40e: Fix changing previously set num_queue_pairs for PFs Currently, the i40e_vsi_setup_queue_map is basing the count of queues in TCs on a VSI's alloc_queue_pairs member which is not changed throughout any user's action (for example via ethtool's set_channels callback). This implies that vsi->tc_config.tc_info[n].qcount value that is given to the kernel via netdev_set_tc_queue() that notifies about the count of queues per particular traffic class is constant even if user has changed the total count of queues. This in turn caused the kernel warning after setting the queue count to the lower value than the initial one: $ ethtool -l ens801f0 Channel parameters for ens801f0: Pre-set maximums: RX: 0 TX: 0 Other: 1 Combined: 64 Current hardware settings: RX: 0 TX: 0 Other: 1 Combined: 64 $ ethtool -L ens801f0 combined 40 [dmesg] Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled! Reason was that vsi->alloc_queue_pairs stayed at 64 value which was used to set the qcount on TC0 (by default only TC0 exists so all of the existing queues are assigned to TC0). we update the offset/qcount via netdev_set_tc_queue() back to the old value but then the netif_set_real_num_tx_queues() is using the vsi->num_queue_pairs as a value which got set to 40. Fix it by using vsi->req_queue_pairs as a queue count that will be distributed across TCs. Do it only for non-zero values, which implies that user actually requested the new count of queues. 
For VSIs other than main, stay with the vsi->alloc_queue_pairs as we only allow manipulating the queue count on main VSI. Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use") Co-developed-by: Maciej Fijalkowski Signed-off-by: Maciej Fijalkowski Co-developed-by: Przemyslaw Patynowski Signed-off-by: Przemyslaw Patynowski Signed-off-by: Eryk Rybak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 35 ++++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 6e309d6ce37d..8437cc14bfc6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1790,6 +1790,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, bool is_add) { struct i40e_pf *pf = vsi->back; + u16 num_tc_qps = 0; u16 sections = 0; u8 netdev_tc = 0; u16 numtc = 1; @@ -1797,13 +1798,29 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, u8 offset; u16 qmap; int i; - u16 num_tc_qps = 0; sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; offset = 0; + if (vsi->type == I40E_VSI_MAIN) { + /* This code helps add more queue to the VSI if we have + * more cores than RSS can support, the higher cores will + * be served by ATR or other filters. Furthermore, the + * non-zero req_queue_pairs says that user requested a new + * queue count via ethtool's set_channels, so use this + * value for queues distribution across traffic classes + */ + if (vsi->req_queue_pairs > 0) + vsi->num_queue_pairs = vsi->req_queue_pairs; + else if (pf->flags & I40E_FLAG_MSIX_ENABLED) + vsi->num_queue_pairs = pf->num_lan_msix; + } + /* Number of queues per enabled TC */ - num_tc_qps = vsi->alloc_queue_pairs; + if (vsi->type == I40E_VSI_MAIN) + num_tc_qps = vsi->num_queue_pairs; + else + num_tc_qps = vsi->alloc_queue_pairs; if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { /* Find numtc from enabled TC bitmap */ for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { @@ -1881,16 +1898,10 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } - - /* Set actual Tx/Rx queue pairs */ - vsi->num_queue_pairs = offset; - if ((vsi->type == I40E_VSI_MAIN) && (numtc == 1)) { - if (vsi->req_queue_pairs > 0) - vsi->num_queue_pairs = vsi->req_queue_pairs; - else if (pf->flags & I40E_FLAG_MSIX_ENABLED) - vsi->num_queue_pairs = pf->num_lan_msix; - } - + /* Do not change previously set num_queue_pairs for PFs */ + if ((vsi->type == I40E_VSI_MAIN && numtc != 1) || + vsi->type != I40E_VSI_MAIN) + vsi->num_queue_pairs = offset; /* Scheduler section valid can only be set for ADD VSI */ if (is_add) { sections |= I40E_AQ_VSI_PROP_SCHED_VALID; From 9e0a603cb7dce2a19d98116d42de84b6db26d716 Mon Sep 17 00:00:00 2001 From: Eryk Rybak Date: Fri, 23 Apr 2021 13:43:26 +0200 Subject: [PATCH 222/336] i40e: Fix ping is lost after configuring ADq on VF Properly reconfigure VF VSIs after VF request ADQ. Created new function to update queue mapping and queue pairs per TC with AQ update VSI. This sets proper RSS size on NIC. VFs num_queue_pairs should not be changed during setup of queue maps. Previously, VF main VSI in ADQ had configured too many queues and had wrong RSS size, which lead to packets not being consumed and drops in connectivity. 
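For context, a rough sketch of the kind of VF-side ADq setup that exposed the problem (interface name, TC count and queue split are illustrative and follow the usual mqprio channel-mode configuration; they are not taken from this patch):

  tc qdisc add dev <vf_iface> root mqprio num_tc 2 \
          map 0 0 0 0 1 1 1 1 queues 4@0 4@4 hw 1 mode channel

With the change below, the PF re-derives the queue mapping and RSS size for each VF TC VSI via i40e_update_adq_vsi_queues(), so ordinary traffic such as a plain ping over the VF keeps working after ADq is configured.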
Fixes: bc6d33c8d93f ("i40e: Fix the number of queues available to be mapped for use") Co-developed-by: Przemyslaw Patynowski Signed-off-by: Przemyslaw Patynowski Signed-off-by: Eryk Rybak Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 64 ++++++++++++++++++- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 17 +++-- 3 files changed, 74 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 35a83a161b6f..4d939af0a626 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1248,6 +1248,7 @@ void i40e_ptp_restore_hw_time(struct i40e_pf *pf); void i40e_ptp_init(struct i40e_pf *pf); void i40e_ptp_stop(struct i40e_pf *pf); int i40e_ptp_alloc_pins(struct i40e_pf *pf); +int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); i40e_status i40e_get_partition_bw_setting(struct i40e_pf *pf); i40e_status i40e_set_partition_bw_setting(struct i40e_pf *pf); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 8437cc14bfc6..37386a270db5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1801,6 +1801,8 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; offset = 0; + /* zero out queue mapping, it will get updated on the end of the function */ + memset(ctxt->info.queue_mapping, 0, sizeof(ctxt->info.queue_mapping)); if (vsi->type == I40E_VSI_MAIN) { /* This code helps add more queue to the VSI if we have @@ -1817,10 +1819,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } /* Number of queues per enabled TC */ - if (vsi->type == I40E_VSI_MAIN) + if (vsi->type == I40E_VSI_MAIN || + (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs != 0)) num_tc_qps = vsi->num_queue_pairs; else num_tc_qps = vsi->alloc_queue_pairs; + if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) { /* Find numtc from enabled TC bitmap */ for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) { @@ -1898,10 +1902,12 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi, } ctxt->info.tc_mapping[i] = cpu_to_le16(qmap); } - /* Do not change previously set num_queue_pairs for PFs */ + /* Do not change previously set num_queue_pairs for PFs and VFs*/ if ((vsi->type == I40E_VSI_MAIN && numtc != 1) || - vsi->type != I40E_VSI_MAIN) + (vsi->type == I40E_VSI_SRIOV && vsi->num_queue_pairs == 0) || + (vsi->type != I40E_VSI_MAIN && vsi->type != I40E_VSI_SRIOV)) vsi->num_queue_pairs = offset; + /* Scheduler section valid can only be set for ADD VSI */ if (is_add) { sections |= I40E_AQ_VSI_PROP_SCHED_VALID; @@ -5438,6 +5444,58 @@ static void i40e_vsi_update_queue_map(struct i40e_vsi *vsi, sizeof(vsi->info.tc_mapping)); } +/** + * i40e_update_adq_vsi_queues - update queue mapping for ADq VSI + * @vsi: the VSI being reconfigured + * @vsi_offset: offset from main VF VSI + */ +int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset) +{ + struct i40e_vsi_context ctxt = {}; + struct i40e_pf *pf; + struct i40e_hw *hw; + int ret; + + if (!vsi) + return I40E_ERR_PARAM; + pf = vsi->back; + hw = &pf->hw; + + ctxt.seid = vsi->seid; + ctxt.pf_num = hw->pf_id; + ctxt.vf_num = vsi->vf_id + hw->func_caps.vf_base_id + vsi_offset; + ctxt.uplink_seid = vsi->uplink_seid; + 
ctxt.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + ctxt.flags = I40E_AQ_VSI_TYPE_VF; + ctxt.info = vsi->info; + + i40e_vsi_setup_queue_map(vsi, &ctxt, vsi->tc_config.enabled_tc, + false); + if (vsi->reconfig_rss) { + vsi->rss_size = min_t(int, pf->alloc_rss_size, + vsi->num_queue_pairs); + ret = i40e_vsi_config_rss(vsi); + if (ret) { + dev_info(&pf->pdev->dev, "Failed to reconfig rss for num_queues\n"); + return ret; + } + vsi->reconfig_rss = false; + } + + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, "Update vsi config failed, err %s aq_err %s\n", + i40e_stat_str(hw, ret), + i40e_aq_str(hw, hw->aq.asq_last_status)); + return ret; + } + /* update the local VSI info with updated queue map */ + i40e_vsi_update_queue_map(vsi, &ctxt); + vsi->info.valid_sections = 0; + + return ret; +} + /** * i40e_vsi_config_tc - Configure VSI Tx Scheduler for given TC map * @vsi: VSI to be configured diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 815661632e7a..2102db11972a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2220,11 +2220,12 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) struct virtchnl_vsi_queue_config_info *qci = (struct virtchnl_vsi_queue_config_info *)msg; struct virtchnl_queue_pair_info *qpi; - struct i40e_pf *pf = vf->pf; u16 vsi_id, vsi_queue_id = 0; - u16 num_qps_all = 0; + struct i40e_pf *pf = vf->pf; i40e_status aq_ret = 0; int i, j = 0, idx = 0; + struct i40e_vsi *vsi; + u16 num_qps_all = 0; if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { aq_ret = I40E_ERR_PARAM; @@ -2313,9 +2314,15 @@ static int i40e_vc_config_queues_msg(struct i40e_vf *vf, u8 *msg) pf->vsi[vf->lan_vsi_idx]->num_queue_pairs = qci->num_queue_pairs; } else { - for (i = 0; i < vf->num_tc; i++) - pf->vsi[vf->ch[i].vsi_idx]->num_queue_pairs = - vf->ch[i].num_qps; + for (i = 0; i < vf->num_tc; i++) { + vsi = pf->vsi[vf->ch[i].vsi_idx]; + vsi->num_queue_pairs = vf->ch[i].num_qps; + + if (i40e_update_adq_vsi_queues(vsi, i)) { + aq_ret = I40E_ERR_CONFIG; + goto error_param; + } + } } error_param: From 3b2b49e6dfdcf423506a771bf44cee842596351a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 17 Nov 2021 17:05:41 +0100 Subject: [PATCH 223/336] Revert "ACPI: scan: Release PM resources blocked by unused objects" Revert commit c10383e8ddf4 ("ACPI: scan: Release PM resources blocked by unused objects"), because it causes boot issues to appear on some platforms. Reported-by: Kyle D. Pelton Reported-by: Saranya Gopal Signed-off-by: Rafael J. Wysocki --- drivers/acpi/glue.c | 25 ------------------------- drivers/acpi/internal.h | 1 - drivers/acpi/scan.c | 6 ------ 3 files changed, 32 deletions(-) diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 7cd0009e7ff3..ef104809f27b 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -347,28 +347,3 @@ void acpi_device_notify_remove(struct device *dev) acpi_unbind_one(dev); } - -int acpi_dev_turn_off_if_unused(struct device *dev, void *not_used) -{ - struct acpi_device *adev = to_acpi_device(dev); - - /* - * Skip device objects with device IDs, because they may be in use even - * if they are not companions of any physical device objects. 
- */ - if (adev->pnp.type.hardware_id) - return 0; - - mutex_lock(&adev->physical_node_lock); - - /* - * Device objects without device IDs are not in use if they have no - * corresponding physical device objects. - */ - if (list_empty(&adev->physical_node_list)) - acpi_device_set_power(adev, ACPI_STATE_D3_COLD); - - mutex_unlock(&adev->physical_node_lock); - - return 0; -} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 8fbdc172864b..d91b560e8867 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -117,7 +117,6 @@ bool acpi_device_is_battery(struct acpi_device *adev); bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev); int acpi_bus_register_early_device(int type); -int acpi_dev_turn_off_if_unused(struct device *dev, void *not_used); /* -------------------------------------------------------------------------- Device Matching and Notification diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a50f1967c73d..2c80765670bc 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2564,12 +2564,6 @@ int __init acpi_scan_init(void) } } - /* - * Make sure that power management resources are not blocked by ACPI - * device objects with no users. - */ - bus_for_each_dev(&acpi_bus_type, NULL, NULL, acpi_dev_turn_off_if_unused); - acpi_turn_off_unused_power_resources(); acpi_scan_initialized = true; From 378c67413de18b69fb3bb78d8c4f0f1192cfa973 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 15 Nov 2021 11:15:19 +0100 Subject: [PATCH 224/336] RDMA/mlx4: Do not fail the registration on port stats If the FW doesn't support MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT, mlx4 driver will fail the ib_setup_port_attrs, which is called from ib_register_device()/enable_device_and_get(), in the end leads to device not detected[1][2] To fix it, add a new mlx4_ib_hw_stats_ops1, w/o alloc_hw_port_stats if FW does not support MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT. [1] https://bugzilla.redhat.com/show_bug.cgi?id=2014094 [2] https://lore.kernel.org/linux-rdma/CAMGffEn2wvEnmzc0xe=xYiCLqpphiHDBxCxqAELrBofbUAMQxw@mail.gmail.com Fixes: 4b5f4d3fb408 ("RDMA: Split the alloc_hw_stats() ops to port and device variants") Link: https://lore.kernel.org/r/20211115101519.27210-1-jinpu.wang@ionos.com Signed-off-by: Jack Wang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ceca05982f61..0d2fa3338784 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2215,6 +2215,11 @@ static const struct ib_device_ops mlx4_ib_hw_stats_ops = { .get_hw_stats = mlx4_ib_get_hw_stats, }; +static const struct ib_device_ops mlx4_ib_hw_stats_ops1 = { + .alloc_hw_device_stats = mlx4_ib_alloc_hw_device_stats, + .get_hw_stats = mlx4_ib_get_hw_stats, +}; + static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) { struct mlx4_ib_diag_counters *diag = ibdev->diag_counters; @@ -2227,9 +2232,16 @@ static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev) return 0; for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) { - /* i == 1 means we are building port counters */ - if (i && !per_port) - continue; + /* + * i == 1 means we are building port counters, set a different + * stats ops without port stats callback. 
+ */ + if (i && !per_port) { + ib_set_device_ops(&ibdev->ib_dev, + &mlx4_ib_hw_stats_ops1); + + return 0; + } ret = __mlx4_ib_alloc_diag_counters(ibdev, &diag[i].descs, &diag[i].offset, From d821f7c13ca03318ad1bdc64ce64afb43080a07a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Nov 2021 14:27:04 +0200 Subject: [PATCH 225/336] RDMA/nldev: Check stat attribute before accessing it The access to non-existent netlink attribute causes to the following kernel panic. Fix it by checking existence before trying to read it. general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 PID: 6744 Comm: syz-executor.0 Not tainted 5.15.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:nla_get_u32 include/net/netlink.h:1554 [inline] RIP: 0010:nldev_stat_set_mode_doit drivers/infiniband/core/nldev.c:1909 [inline] RIP: 0010:nldev_stat_set_doit+0x578/0x10d0 drivers/infiniband/core/nldev.c:2040 Code: fa 4c 8b a4 24 f8 02 00 00 48 b8 00 00 00 00 00 fc ff df c7 84 24 80 00 00 00 00 00 00 00 49 8d 7c 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 02 RSP: 0018:ffffc90004acf2e8 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc90002b94000 RDX: 0000000000000000 RSI: ffffffff8684c5ff RDI: 0000000000000004 RBP: ffff88807cda4000 R08: 0000000000000000 R09: ffff888023fb8027 R10: ffffffff8684c5d7 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000001 R14: ffff888041024280 R15: ffff888031ade780 FS: 00007eff9dddd700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2ef24000 CR3: 0000000036902000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rdma_nl_rcv_msg+0x36d/0x690 drivers/infiniband/core/netlink.c:195 rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline] rdma_nl_rcv+0x2ee/0x430 drivers/infiniband/core/netlink.c:259 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x86d/0xda0 net/netlink/af_netlink.c:1916 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2409 ___sys_sendmsg+0xf3/0x170 net/socket.c:2463 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2492 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 822cf785ac6d ("RDMA/nldev: Split nldev_stat_set_mode_doit out of nldev_stat_set_doit") Link: https://lore.kernel.org/r/b21967c366f076ff1988862f9c8a1aa0244c599f.1637151999.git.leonro@nvidia.com Reported-by: syzbot+9111d2255a9710e87562@syzkaller.appspotmail.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index fedc0fa6ebf9..f5aacaf7fb8e 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1906,7 +1906,8 @@ static int nldev_stat_set_mode_doit(struct sk_buff *msg, int ret; /* Currently only counter for QP is supported */ - if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != 
RDMA_NLDEV_ATTR_RES_QP) + if (!tb[RDMA_NLDEV_ATTR_STAT_RES] || + nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) return -EINVAL; mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); From 38a268b39182bfe694806e03974326270c1f170f Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 10 Nov 2021 14:58:55 -0500 Subject: [PATCH 226/336] drm/amd/pm: Enhanced reporting also for a stuck command Also print the message index and parameter of the stuck command. Cc: Alex Deucher Signed-off-by: Luben Tuikov Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index 843d2cbfc71d..ea6f50c08c5f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -139,9 +139,13 @@ static void __smu_cmn_reg_print_error(struct smu_context *smu, const char *message = smu_get_message_name(smu, msg); switch (reg_c2pmsg_90) { - case SMU_RESP_NONE: + case SMU_RESP_NONE: { + u32 msg_idx = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66); + u32 prm = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82); dev_err_ratelimited(adev->dev, - "SMU: I'm not done with your previous command!"); + "SMU: I'm not done with your previous command: SMN_C2PMSG_66:0x%08X SMN_C2PMSG_82:0x%08X", + msg_idx, prm); + } break; case SMU_RESP_OK: /* The SMU executed the command. It completed with a From 69650a879b93e7e445e7a833287701ea7f32bd3a Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Thu, 11 Nov 2021 11:15:08 +0800 Subject: [PATCH 227/336] drm/amdgpu: add error print when failing to add IP block(v2) Driver initialization is driven by IP version from IP discovery table. So add error print when failing to add ip block during driver initialization, this will be more friendly to user to know which IP version is not correct. [ 40.467361] [drm] host supports REQ_INIT_DATA handshake [ 40.474076] [drm] add ip block number 0 [ 40.474090] [drm] add ip block number 1 [ 40.474101] [drm] add ip block number 2 [ 40.474103] [drm] add ip block number 3 [ 40.474114] [drm] add ip block number 4 [ 40.474119] [drm] add ip block number 5 [ 40.474134] [drm] add ip block number 6 [ 40.474143] [drm] add ip block number 7 [ 40.474147] amdgpu 0000:00:08.0: amdgpu: Fatal error during GPU init [ 40.474545] amdgpu 0000:00:08.0: amdgpu: amdgpu: finishing device. 
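The pattern applied throughout amdgpu_discovery.c can be pictured with this small, self-contained sketch (hypothetical version values, not the driver's tables): an unrecognized version falls through to the default branch, which now reports which value was seen before returning -EINVAL instead of failing silently:

	#include <stdio.h>
	#include <errno.h>

	static int example_add_ip_block(unsigned int version)
	{
		switch (version) {
		case 0x0900:	/* assumed supported version */
		case 0x0A00:	/* assumed supported version */
			printf("added ip block for version 0x%x\n", version);
			return 0;
		default:
			fprintf(stderr, "Failed to add ip block (version 0x%x)\n", version);
			return -EINVAL;
		}
	}

	int main(void)
	{
		example_add_ip_block(0x0A00);
		example_add_ip_block(0x0B00);	/* unknown: now logged, not just rejected */
		return 0;
	}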
v2: use dev_err to multi-GPU system Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index ff70bc233489..4e3669407518 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -587,6 +587,9 @@ static int amdgpu_discovery_set_common_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &nv_common_ip_block); break; default: + dev_err(adev->dev, + "Failed to add common ip block(GC_HWIP:0x%x)\n", + adev->ip_versions[GC_HWIP][0]); return -EINVAL; } return 0; @@ -619,6 +622,9 @@ static int amdgpu_discovery_set_gmc_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &gmc_v10_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add gmc ip block(GC_HWIP:0x%x)\n", + adev->ip_versions[GC_HWIP][0]); return -EINVAL; } return 0; @@ -648,6 +654,9 @@ static int amdgpu_discovery_set_ih_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &navi10_ih_ip_block); break; default: + dev_err(adev->dev, + "Failed to add ih ip block(OSSSYS_HWIP:0x%x)\n", + adev->ip_versions[OSSSYS_HWIP][0]); return -EINVAL; } return 0; @@ -688,6 +697,9 @@ static int amdgpu_discovery_set_psp_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &psp_v13_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add psp ip block(MP0_HWIP:0x%x)\n", + adev->ip_versions[MP0_HWIP][0]); return -EINVAL; } return 0; @@ -726,6 +738,9 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add smu ip block(MP1_HWIP:0x%x)\n", + adev->ip_versions[MP1_HWIP][0]); return -EINVAL; } return 0; @@ -753,6 +768,9 @@ static int amdgpu_discovery_set_display_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &dm_ip_block); break; default: + dev_err(adev->dev, + "Failed to add dm ip block(DCE_HWIP:0x%x)\n", + adev->ip_versions[DCE_HWIP][0]); return -EINVAL; } } else if (adev->ip_versions[DCI_HWIP][0]) { @@ -763,6 +781,9 @@ static int amdgpu_discovery_set_display_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &dm_ip_block); break; default: + dev_err(adev->dev, + "Failed to add dm ip block(DCI_HWIP:0x%x)\n", + adev->ip_versions[DCI_HWIP][0]); return -EINVAL; } #endif @@ -796,6 +817,9 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &gfx_v10_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add gfx ip block(GC_HWIP:0x%x)\n", + adev->ip_versions[GC_HWIP][0]); return -EINVAL; } return 0; @@ -829,6 +853,9 @@ static int amdgpu_discovery_set_sdma_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &sdma_v5_2_ip_block); break; default: + dev_err(adev->dev, + "Failed to add sdma ip block(SDMA0_HWIP:0x%x)\n", + adev->ip_versions[SDMA0_HWIP][0]); return -EINVAL; } return 0; @@ -845,6 +872,9 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &uvd_v7_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add uvd v7 ip block(UVD_HWIP:0x%x)\n", + adev->ip_versions[UVD_HWIP][0]); return -EINVAL; } switch (adev->ip_versions[VCE_HWIP][0]) { @@ -855,6 +885,9 @@ static int 
amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &vce_v4_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add VCE v4 ip block(VCE_HWIP:0x%x)\n", + adev->ip_versions[VCE_HWIP][0]); return -EINVAL; } } else { @@ -893,6 +926,9 @@ static int amdgpu_discovery_set_mm_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &vcn_v3_0_ip_block); break; default: + dev_err(adev->dev, + "Failed to add vcn/jpeg ip block(UVD_HWIP:0x%x)\n", + adev->ip_versions[UVD_HWIP][0]); return -EINVAL; } } From 6ee27ee27ba8b2e725886951ba2d2d87f113bece Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 5 Nov 2021 15:25:30 +0800 Subject: [PATCH 228/336] drm/amd/pm: avoid duplicate powergate/ungate setting Just bail out if the target IP block is already in the desired powergate/ungate state. This can avoid some duplicate settings which sometimes may cause unexpected issues. Link: https://lore.kernel.org/all/YV81vidWQLWvATMM@zn.tnic/ Bug: https://bugzilla.kernel.org/show_bug.cgi?id=214921 Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215025 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1789 Fixes: bf756fb833cb ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend") Signed-off-by: Evan Quan Tested-by: Borislav Petkov Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/include/amd_shared.h | 3 ++- drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 10 ++++++++++ drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 8 ++++++++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5625f7736e37..188accb71249 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3509,6 +3509,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->rmmio_size = pci_resource_len(adev->pdev, 2); } + for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) + atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); + adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); if (adev->rmmio == NULL) { return -ENOMEM; diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h index f1a46d16f7ea..4b9e68a79f06 100644 --- a/drivers/gpu/drm/amd/include/amd_shared.h +++ b/drivers/gpu/drm/amd/include/amd_shared.h @@ -98,7 +98,8 @@ enum amd_ip_block_type { AMD_IP_BLOCK_TYPE_ACP, AMD_IP_BLOCK_TYPE_VCN, AMD_IP_BLOCK_TYPE_MES, - AMD_IP_BLOCK_TYPE_JPEG + AMD_IP_BLOCK_TYPE_JPEG, + AMD_IP_BLOCK_TYPE_NUM, }; enum amd_clockgating_state { diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c index 03581d5b1836..08362d506534 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c @@ -927,6 +927,13 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, uint32_t block { int ret = 0; const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; + enum ip_power_state pwr_state = gate ? POWER_STATE_OFF : POWER_STATE_ON; + + if (atomic_read(&adev->pm.pwr_state[block_type]) == pwr_state) { + dev_dbg(adev->dev, "IP block%d already in the target %s state!", + block_type, gate ? 
"gate" : "ungate"); + return 0; + } switch (block_type) { case AMD_IP_BLOCK_TYPE_UVD: @@ -979,6 +986,9 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, uint32_t block break; } + if (!ret) + atomic_set(&adev->pm.pwr_state[block_type], pwr_state); + return ret; } diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h index 98f1b3d8c1d5..16e3f72d31b9 100644 --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h @@ -417,6 +417,12 @@ struct amdgpu_dpm { enum amd_dpm_forced_level forced_level; }; +enum ip_power_state { + POWER_STATE_UNKNOWN, + POWER_STATE_ON, + POWER_STATE_OFF, +}; + struct amdgpu_pm { struct mutex mutex; u32 current_sclk; @@ -452,6 +458,8 @@ struct amdgpu_pm { struct i2c_adapter smu_i2c; struct mutex smu_i2c_mutex; struct list_head pm_attr_list; + + atomic_t pwr_state[AMD_IP_BLOCK_TYPE_NUM]; }; #define R600_SSTU_DFLT 0 From be83a5676767c99c2417083c29d42aa1e109a69d Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Mon, 15 Nov 2021 15:23:27 +0800 Subject: [PATCH 229/336] drm/amd/pm: Remove artificial freq level on Navi1x Print Navi1x fine grained clocks in a consistent manner with other SOCs. Don't show aritificial DPM level when the current clock equals min or max. Signed-off-by: Lijo Lazar Reviewed-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index 71161f6b78fe..60a557068ea4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -1265,7 +1265,7 @@ static int navi10_print_clk_levels(struct smu_context *smu, enum smu_clk_type clk_type, char *buf) { uint16_t *curve_settings; - int i, size = 0, ret = 0; + int i, levels, size = 0, ret = 0; uint32_t cur_value = 0, value = 0, count = 0; uint32_t freq_values[3] = {0}; uint32_t mark_index = 0; @@ -1319,14 +1319,17 @@ static int navi10_print_clk_levels(struct smu_context *smu, freq_values[1] = cur_value; mark_index = cur_value == freq_values[0] ? 0 : cur_value == freq_values[2] ? 2 : 1; - if (mark_index != 1) - freq_values[1] = (freq_values[0] + freq_values[2]) / 2; - for (i = 0; i < 3; i++) { + levels = 3; + if (mark_index != 1) { + levels = 2; + freq_values[1] = freq_values[2]; + } + + for (i = 0; i < levels; i++) { size += sysfs_emit_at(buf, size, "%d: %uMhz %s\n", i, freq_values[i], i == mark_index ? "*" : ""); } - } break; case SMU_PCIE: From 3a3b311e3881172fc8e019b6508f04bc40c92d9d Mon Sep 17 00:00:00 2001 From: Karen Sornek Date: Wed, 28 Apr 2021 10:19:41 +0200 Subject: [PATCH 230/336] i40e: Fix warning message and call stack during rmmod i40e driver Restore part of reset functionality used when reset is called from the VF to reset itself. Without this fix warning message is displayed when VF is being removed via sysfs. Fix the crash of the VF during reset by ensuring that the PF receives the reset message successfully. Refactor code to use one function instead of two. 
Fixes: 5c3c48ac6bf5 ("i40e: implement virtual device interface") Signed-off-by: Grzegorz Szczurek Signed-off-by: Karen Sornek Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 53 ++++++++----------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 2102db11972a..80ae264c99ba 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -183,17 +183,18 @@ void i40e_vc_notify_vf_reset(struct i40e_vf *vf) /***********************misc routines*****************************/ /** - * i40e_vc_disable_vf + * i40e_vc_reset_vf * @vf: pointer to the VF info - * - * Disable the VF through a SW reset. + * @notify_vf: notify vf about reset or not + * Reset VF handler. **/ -static inline void i40e_vc_disable_vf(struct i40e_vf *vf) +static void i40e_vc_reset_vf(struct i40e_vf *vf, bool notify_vf) { struct i40e_pf *pf = vf->pf; int i; - i40e_vc_notify_vf_reset(vf); + if (notify_vf) + i40e_vc_notify_vf_reset(vf); /* We want to ensure that an actual reset occurs initiated after this * function was called. However, we do not want to wait forever, so @@ -211,9 +212,14 @@ static inline void i40e_vc_disable_vf(struct i40e_vf *vf) usleep_range(10000, 20000); } - dev_warn(&vf->pf->pdev->dev, - "Failed to initiate reset for VF %d after 200 milliseconds\n", - vf->vf_id); + if (notify_vf) + dev_warn(&vf->pf->pdev->dev, + "Failed to initiate reset for VF %d after 200 milliseconds\n", + vf->vf_id); + else + dev_dbg(&vf->pf->pdev->dev, + "Failed to initiate reset for VF %d after 200 milliseconds\n", + vf->vf_id); } /** @@ -2108,20 +2114,6 @@ err: return ret; } -/** - * i40e_vc_reset_vf_msg - * @vf: pointer to the VF info - * - * called from the VF to reset itself, - * unlike other virtchnl messages, PF driver - * doesn't send the response back to the VF - **/ -static void i40e_vc_reset_vf_msg(struct i40e_vf *vf) -{ - if (test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) - i40e_reset_vf(vf, false); -} - /** * i40e_vc_config_promiscuous_mode_msg * @vf: pointer to the VF info @@ -2617,8 +2609,7 @@ static int i40e_vc_request_queues_msg(struct i40e_vf *vf, u8 *msg) } else { /* successful request */ vf->num_req_queues = req_pairs; - i40e_vc_notify_vf_reset(vf); - i40e_reset_vf(vf, false); + i40e_vc_reset_vf(vf, true); return 0; } @@ -3813,8 +3804,7 @@ static int i40e_vc_add_qch_msg(struct i40e_vf *vf, u8 *msg) vf->num_req_queues = 0; /* reset the VF in order to allocate resources */ - i40e_vc_notify_vf_reset(vf); - i40e_reset_vf(vf, false); + i40e_vc_reset_vf(vf, true); return I40E_SUCCESS; @@ -3854,8 +3844,7 @@ static int i40e_vc_del_qch_msg(struct i40e_vf *vf, u8 *msg) } /* reset the VF in order to allocate resources */ - i40e_vc_notify_vf_reset(vf); - i40e_reset_vf(vf, false); + i40e_vc_reset_vf(vf, true); return I40E_SUCCESS; @@ -3917,7 +3906,7 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, i40e_vc_notify_vf_link_state(vf); break; case VIRTCHNL_OP_RESET_VF: - i40e_vc_reset_vf_msg(vf); + i40e_vc_reset_vf(vf, false); ret = 0; break; case VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE: @@ -4171,7 +4160,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) /* Force the VF interface down so it has to bring up with new MAC * address */ - i40e_vc_disable_vf(vf); + i40e_vc_reset_vf(vf, true); dev_info(&pf->pdev->dev, "Bring down and up the VF interface to make 
this change effective.\n"); error_param: @@ -4235,7 +4224,7 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, int vf_id, /* duplicate request, so just return success */ goto error_pvid; - i40e_vc_disable_vf(vf); + i40e_vc_reset_vf(vf, true); /* During reset the VF got a new VSI, so refresh a pointer. */ vsi = pf->vsi[vf->lan_vsi_idx]; /* Locked once because multiple functions below iterate list */ @@ -4613,7 +4602,7 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting) goto out; vf->trusted = setting; - i40e_vc_disable_vf(vf); + i40e_vc_reset_vf(vf, true); dev_info(&pf->pdev->dev, "VF %u is now %strusted\n", vf_id, setting ? "" : "un"); From 2e6d218c1ec6fb9cd70693b78134cbc35ae0b5a9 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 21 Jun 2021 08:37:31 +0000 Subject: [PATCH 231/336] i40e: Fix creation of first queue by omitting it if is not power of two Reject TCs creation with proper message if the first queue assignment is not equal to the power of two. The first queue number was checked too late in the second queue iteration, if second queue was configured at all. Now if first queue value is not a power of two, then trying to create qdisc will be rejected. Fixes: 8f88b3034db3 ("i40e: Add infrastructure for queue channel support") Signed-off-by: Grzegorz Szczurek Signed-off-by: Jedrzej Jagielski Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 59 +++++++-------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 37386a270db5..0a98fab6d019 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5786,24 +5786,6 @@ static void i40e_remove_queue_channels(struct i40e_vsi *vsi) INIT_LIST_HEAD(&vsi->ch_list); } -/** - * i40e_is_any_channel - channel exist or not - * @vsi: ptr to VSI to which channels are associated with - * - * Returns true or false if channel(s) exist for associated VSI or not - **/ -static bool i40e_is_any_channel(struct i40e_vsi *vsi) -{ - struct i40e_channel *ch, *ch_tmp; - - list_for_each_entry_safe(ch, ch_tmp, &vsi->ch_list, list) { - if (ch->initialized) - return true; - } - - return false; -} - /** * i40e_get_max_queues_for_channel * @vsi: ptr to VSI to which channels are associated with @@ -6310,26 +6292,15 @@ int i40e_create_queue_channel(struct i40e_vsi *vsi, /* By default we are in VEPA mode, if this is the first VF/VMDq * VSI to be added switch to VEB mode. */ - if ((!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) || - (!i40e_is_any_channel(vsi))) { - if (!is_power_of_2(vsi->tc_config.tc_info[0].qcount)) { - dev_dbg(&pf->pdev->dev, - "Failed to create channel. 
Override queues (%u) not power of 2\n", - vsi->tc_config.tc_info[0].qcount); - return -EINVAL; - } - if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { - pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; + if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { + pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; - if (vsi->type == I40E_VSI_MAIN) { - if (pf->flags & I40E_FLAG_TC_MQPRIO) - i40e_do_reset(pf, I40E_PF_RESET_FLAG, - true); - else - i40e_do_reset_safe(pf, - I40E_PF_RESET_FLAG); - } + if (vsi->type == I40E_VSI_MAIN) { + if (pf->flags & I40E_FLAG_TC_MQPRIO) + i40e_do_reset(pf, I40E_PF_RESET_FLAG, true); + else + i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); } /* now onwards for main VSI, number of queues will be value * of TC0's queue count @@ -7982,12 +7953,20 @@ config_tc: vsi->seid); need_reset = true; goto exit; - } else { - dev_info(&vsi->back->pdev->dev, - "Setup channel (id:%u) utilizing num_queues %d\n", - vsi->seid, vsi->tc_config.tc_info[0].qcount); + } else if (enabled_tc && + (!is_power_of_2(vsi->tc_config.tc_info[0].qcount))) { + netdev_info(netdev, + "Failed to create channel. Override queues (%u) not power of 2\n", + vsi->tc_config.tc_info[0].qcount); + ret = -EINVAL; + need_reset = true; + goto exit; } + dev_info(&vsi->back->pdev->dev, + "Setup channel (id:%u) utilizing num_queues %d\n", + vsi->seid, vsi->tc_config.tc_info[0].qcount); + if (pf->flags & I40E_FLAG_TC_MQPRIO) { if (vsi->mqprio_qopt.max_rate[0]) { u64 max_tx_rate = vsi->mqprio_qopt.max_rate[0]; From 5aff430d4e33a0b48a6b3d5beb06f79da23f9916 Mon Sep 17 00:00:00 2001 From: Grzegorz Szczurek Date: Fri, 29 Oct 2021 11:26:01 +0200 Subject: [PATCH 232/336] i40e: Fix display error code in dmesg Fix misleading display error in dmesg if tc filter return fail. Only i40e status error code should be converted to string, not linux error code. Otherwise, we return false information about the error. Fixes: 2f4b411a3d67 ("i40e: Enable cloud filters via tc-flower") Signed-off-by: Grzegorz Szczurek Signed-off-by: Mateusz Palczewski Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 0a98fab6d019..e118cf9265c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8531,9 +8531,8 @@ static int i40e_configure_clsflower(struct i40e_vsi *vsi, err = i40e_add_del_cloud_filter(vsi, filter, true); if (err) { - dev_err(&pf->pdev->dev, - "Failed to add cloud filter, err %s\n", - i40e_stat_str(&pf->hw, err)); + dev_err(&pf->pdev->dev, "Failed to add cloud filter, err %d\n", + err); goto err; } From dab60582685aabdae2d4ff7ce716456bd0dc7a0f Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 17 Nov 2021 10:05:36 -0500 Subject: [PATCH 233/336] drm/amd/display: Fix OLED brightness control on eDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] After commit ("drm/amdgpu/display: add support for multiple backlights") number of eDPs is defined while registering backlight device. However the panel's extended caps get updated once before register call. That leads to regression with extended caps like oled brightness control. 
[How] Update connector ext caps after register_backlight_device Fixes: 7fd13baeb7a3a4 ("drm/amdgpu/display: add support for multiple backlights") Link: https://www.reddit.com/r/AMDLaptops/comments/qst0fm/after_updating_to_linux_515_my_brightness/ Signed-off-by: Roman Li Tested-by: Samuel Čavoj Acked-by: Alex Deucher Reviewed-by: Jasdeep Dhillon Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index c911b30de658..c27cb47bc988 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4242,7 +4242,8 @@ static int amdgpu_dm_initialize_drm_device(struct amdgpu_device *adev) } else if (dc_link_detect(link, DETECT_REASON_BOOT)) { amdgpu_dm_update_connector_after_detect(aconnector); register_backlight_device(dm, link); - + if (dm->num_of_edps) + update_connector_ext_caps(aconnector); if (psr_feature_enabled) amdgpu_dm_set_psr_caps(link); } From bf552083916a7f8800477b5986940d1c9a31b953 Mon Sep 17 00:00:00 2001 From: hongao Date: Thu, 11 Nov 2021 11:32:07 +0800 Subject: [PATCH 234/336] drm/amdgpu: fix set scaling mode Full/Full aspect/Center not works on vga and dvi connectors amdgpu_connector_vga_get_modes missed function amdgpu_get_native_mode which assign amdgpu_encoder->native_mode with *preferred_mode result in amdgpu_encoder->native_mode.clock always be 0. That will cause amdgpu_connector_set_property returned early on: if ((rmx_type != DRM_MODE_SCALE_NONE) && (amdgpu_encoder->native_mode.clock == 0)) when we try to set scaling mode Full/Full aspect/Center. Add the missing function to amdgpu_connector_vga_get_mode can fix this. It also works on dvi connectors because amdgpu_connector_dvi_helper_funcs.get_mode use the same method. Signed-off-by: hongao Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index b9c11c2b2885..0de66f59adb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -827,6 +827,7 @@ static int amdgpu_connector_vga_get_modes(struct drm_connector *connector) amdgpu_connector_get_edid(connector); ret = amdgpu_connector_ddc_get_modes(connector); + amdgpu_get_native_mode(connector); return ret; } From 3dac776e349a214c07fb2b0e5973947b0aade4f6 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Thu, 28 Oct 2021 06:05:42 -0400 Subject: [PATCH 235/336] drm/amd/pm: add GFXCLK/SCLK clocks level print support for APUs add support that allow the userspace tool like RGP to get the GFX clock value at runtime, the fix follow the old way to show the min/current/max clocks level for compatible consideration. === Test === $ cat /sys/class/drm/card0/device/pp_dpm_sclk 0: 200Mhz * 1: 1100Mhz 2: 1600Mhz then run stress test on one APU system. $ cat /sys/class/drm/card0/device/pp_dpm_sclk 0: 200Mhz 1: 1040Mhz * 2: 1600Mhz The current GFXCLK value is updated at runtime. 
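The sysfs output shown in the test above can also be checked from a small C program; this is only a convenience sketch (the card0 path is an example and varies per system):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *path = "/sys/class/drm/card0/device/pp_dpm_sclk"; /* example path */
		char line[128];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			fputs(line, stdout);
			if (strchr(line, '*'))
				printf("  ^ current level\n");
		}
		fclose(f);
		return 0;
	}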
BugLink: https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260 Reviewed-by: Huang Ray Signed-off-by: Perry Yuan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/pm/swsmu/smu11/cyan_skillfish_ppt.c | 22 +++++++++++++-- .../gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 26 ++++++++++++++++++ .../drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c | 27 +++++++++++++++++++ .../drm/amd/pm/swsmu/smu13/yellow_carp_ppt.h | 1 + 4 files changed, 74 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c index cbc3f99e8573..2238ee19c222 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c @@ -309,6 +309,7 @@ static int cyan_skillfish_print_clk_levels(struct smu_context *smu, { int ret = 0, size = 0; uint32_t cur_value = 0; + int i; smu_cmn_get_sysfs_buf(&buf, &size); @@ -334,8 +335,6 @@ static int cyan_skillfish_print_clk_levels(struct smu_context *smu, size += sysfs_emit_at(buf, size, "VDDC: %7umV %10umV\n", CYAN_SKILLFISH_VDDC_MIN, CYAN_SKILLFISH_VDDC_MAX); break; - case SMU_GFXCLK: - case SMU_SCLK: case SMU_FCLK: case SMU_MCLK: case SMU_SOCCLK: @@ -346,6 +345,25 @@ static int cyan_skillfish_print_clk_levels(struct smu_context *smu, return ret; size += sysfs_emit_at(buf, size, "0: %uMhz *\n", cur_value); break; + case SMU_SCLK: + case SMU_GFXCLK: + ret = cyan_skillfish_get_current_clk_freq(smu, clk_type, &cur_value); + if (ret) + return ret; + if (cur_value == CYAN_SKILLFISH_SCLK_MAX) + i = 2; + else if (cur_value == CYAN_SKILLFISH_SCLK_MIN) + i = 0; + else + i = 1; + size += sysfs_emit_at(buf, size, "0: %uMhz %s\n", CYAN_SKILLFISH_SCLK_MIN, + i == 0 ? "*" : ""); + size += sysfs_emit_at(buf, size, "1: %uMhz %s\n", + i == 1 ? cur_value : cyan_skillfish_sclk_default, + i == 1 ? "*" : ""); + size += sysfs_emit_at(buf, size, "2: %uMhz %s\n", CYAN_SKILLFISH_SCLK_MAX, + i == 2 ? "*" : ""); + break; default: dev_warn(smu->adev->dev, "Unsupported clock type\n"); return ret; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 421f38e8dada..c02ed65ffa38 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -683,6 +683,7 @@ static int vangogh_print_clk_levels(struct smu_context *smu, int i, size = 0, ret = 0; uint32_t cur_value = 0, value = 0, count = 0; bool cur_value_match_level = false; + uint32_t min, max; memset(&metrics, 0, sizeof(metrics)); @@ -743,6 +744,13 @@ static int vangogh_print_clk_levels(struct smu_context *smu, if (ret) return ret; break; + case SMU_GFXCLK: + case SMU_SCLK: + ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetGfxclkFrequency, 0, &cur_value); + if (ret) { + return ret; + } + break; default: break; } @@ -768,6 +776,24 @@ static int vangogh_print_clk_levels(struct smu_context *smu, if (!cur_value_match_level) size += sysfs_emit_at(buf, size, " %uMhz *\n", cur_value); break; + case SMU_GFXCLK: + case SMU_SCLK: + min = (smu->gfx_actual_hard_min_freq > 0) ? smu->gfx_actual_hard_min_freq : smu->gfx_default_hard_min_freq; + max = (smu->gfx_actual_soft_max_freq > 0) ? smu->gfx_actual_soft_max_freq : smu->gfx_default_soft_max_freq; + if (cur_value == max) + i = 2; + else if (cur_value == min) + i = 0; + else + i = 1; + size += sysfs_emit_at(buf, size, "0: %uMhz %s\n", min, + i == 0 ? "*" : ""); + size += sysfs_emit_at(buf, size, "1: %uMhz %s\n", + i == 1 ? 
cur_value : VANGOGH_UMD_PSTATE_STANDARD_GFXCLK, + i == 1 ? "*" : ""); + size += sysfs_emit_at(buf, size, "2: %uMhz %s\n", max, + i == 2 ? "*" : ""); + break; default: break; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c index 8215bbf5ed7c..caf1775d48ef 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.c @@ -697,6 +697,11 @@ static int yellow_carp_get_current_clk_freq(struct smu_context *smu, case SMU_FCLK: return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_GetFclkFrequency, 0, value); + case SMU_GFXCLK: + case SMU_SCLK: + return smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_GetGfxclkFrequency, 0, value); + break; default: return -EINVAL; } @@ -967,6 +972,7 @@ static int yellow_carp_print_clk_levels(struct smu_context *smu, { int i, size = 0, ret = 0; uint32_t cur_value = 0, value = 0, count = 0; + uint32_t min, max; smu_cmn_get_sysfs_buf(&buf, &size); @@ -1005,6 +1011,27 @@ static int yellow_carp_print_clk_levels(struct smu_context *smu, cur_value == value ? "*" : ""); } break; + case SMU_GFXCLK: + case SMU_SCLK: + ret = yellow_carp_get_current_clk_freq(smu, clk_type, &cur_value); + if (ret) + goto print_clk_out; + min = (smu->gfx_actual_hard_min_freq > 0) ? smu->gfx_actual_hard_min_freq : smu->gfx_default_hard_min_freq; + max = (smu->gfx_actual_soft_max_freq > 0) ? smu->gfx_actual_soft_max_freq : smu->gfx_default_soft_max_freq; + if (cur_value == max) + i = 2; + else if (cur_value == min) + i = 0; + else + i = 1; + size += sysfs_emit_at(buf, size, "0: %uMhz %s\n", min, + i == 0 ? "*" : ""); + size += sysfs_emit_at(buf, size, "1: %uMhz %s\n", + i == 1 ? cur_value : YELLOW_CARP_UMD_PSTATE_GFXCLK, + i == 1 ? "*" : ""); + size += sysfs_emit_at(buf, size, "2: %uMhz %s\n", max, + i == 2 ? "*" : ""); + break; default: break; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.h index b3ad8352c68a..a9205a8ea3ad 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/yellow_carp_ppt.h @@ -24,5 +24,6 @@ #define __YELLOW_CARP_PPT_H__ extern void yellow_carp_set_ppt_funcs(struct smu_context *smu); +#define YELLOW_CARP_UMD_PSTATE_GFXCLK 1100 #endif From 3e6db079751afd527bf3db32314ae938dc571916 Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 15 Nov 2021 08:01:43 -0800 Subject: [PATCH 236/336] tipc: check for null after calling kmemdup kmemdup can return a null pointer so need to check for it, otherwise the null key will be dereferenced later in tipc_crypto_key_xmit as can be seen in the trace [1]. 
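The rule being enforced is the usual one for any duplicating allocation: check the result before use and bail out cleanly on failure. A minimal userspace analogue (malloc/memcpy standing in for kmemdup; the names are hypothetical):

	#include <stdlib.h>
	#include <string.h>
	#include <errno.h>

	struct example_ctx {
		unsigned char *key;
		size_t key_len;
	};

	static int example_set_key(struct example_ctx *ctx, const void *ukey, size_t len)
	{
		ctx->key = malloc(len);
		if (!ctx->key)
			return -ENOMEM;	/* return before anything dereferences ctx->key */
		memcpy(ctx->key, ukey, len);
		ctx->key_len = len;
		return 0;
	}

	int main(void)
	{
		struct example_ctx ctx = { 0 };
		int rc = example_set_key(&ctx, "0123456789abcdef", 16);

		free(ctx.key);
		return rc ? 1 : 0;
	}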
Cc: tipc-discussion@lists.sourceforge.net Cc: stable@vger.kernel.org # 5.15, 5.14, 5.10 [1] https://syzkaller.appspot.com/bug?id=bca180abb29567b189efdbdb34cbf7ba851c2a58 Reported-by: Dmitry Vyukov Signed-off-by: Tadeusz Struk Acked-by: Ying Xue Acked-by: Jon Maloy Link: https://lore.kernel.org/r/20211115160143.5099-1-tadeusz.struk@linaro.org Signed-off-by: Jakub Kicinski --- net/tipc/crypto.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index e701651f6533..b4d9419a015b 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -597,6 +597,10 @@ static int tipc_aead_init(struct tipc_aead **aead, struct tipc_aead_key *ukey, tmp->cloned = NULL; tmp->authsize = TIPC_AES_GCM_TAG_SIZE; tmp->key = kmemdup(ukey, tipc_aead_key_size(ukey), GFP_KERNEL); + if (!tmp->key) { + tipc_aead_free(&tmp->rcu); + return -ENOMEM; + } memcpy(&tmp->salt, ukey->key + keylen, TIPC_AES_GCM_SALT_SIZE); atomic_set(&tmp->users, 0); atomic64_set(&tmp->seqno, 0); From 2cf49e00d40d5132e3d067b5aa6d84791929ab15 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Sun, 14 Nov 2021 12:38:18 -0500 Subject: [PATCH 237/336] drm/amd/amdkfd: Fix kernel panic when reset failed and been triggered again In SRIOV configuration, the reset may failed to bring asic back to normal but stop cpsch already been called, the start_cpsch will not be called since there is no resume in this case. When reset been triggered again, driver should avoid to do uninitialization again. Signed-off-by: shaoyunl Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 003ba6a373ff..93e33dd84dd4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1226,6 +1226,11 @@ static int stop_cpsch(struct device_queue_manager *dqm) bool hanging; dqm_lock(dqm); + if (!dqm->sched_running) { + dqm_unlock(dqm); + return 0; + } + if (!dqm->is_hws_hang) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; From 27dfaedc0d321b4ea4e10c53e4679d6911ab17aa Mon Sep 17 00:00:00 2001 From: Bernard Zhao Date: Sun, 14 Nov 2021 18:58:50 -0800 Subject: [PATCH 238/336] drm/amd/amdgpu: fix potential memleak In function amdgpu_get_xgmi_hive, when kobject_init_and_add failed There is a potential memleak if not call kobject_put. 
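The background rule here, shown as a simplified userspace analogue rather than the driver's exact cleanup (all names below are hypothetical): once an object has taken its initial reference, an error path should drop that reference so the release callback can run, instead of leaking it.

	#include <stdio.h>
	#include <stdlib.h>

	struct example_obj {
		int refcount;
	};

	static void example_put(struct example_obj *obj)
	{
		if (--obj->refcount == 0) {
			printf("release callback frees the object\n");
			free(obj);
		}
	}

	static int example_register(struct example_obj *obj)
	{
		(void)obj;
		return -1;	/* pretend the "add" step failed */
	}

	int main(void)
	{
		struct example_obj *obj = calloc(1, sizeof(*obj));

		if (!obj)
			return 1;
		obj->refcount = 1;	/* initial reference taken at init time */
		if (example_register(obj)) {
			example_put(obj);	/* drop the reference on the error path */
			return 1;
		}
		example_put(obj);
		return 0;
	}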
Reviewed-by: Felix Kuehling Signed-off-by: Bernard Zhao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 0fad2bf854ae..567df2db23ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -386,6 +386,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) "%s", "xgmi_hive_info"); if (ret) { dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n"); + kobject_put(&hive->kobj); kfree(hive); hive = NULL; goto pro_end; From 86cdf8e38792545161dbe3350a7eced558ba4d15 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Mon, 15 Nov 2021 22:56:00 +0800 Subject: [PATCH 239/336] NFC: reorganize the functions in nci_request There is a possible data race as shown below: thread-A in nci_request() | thread-B in nci_close_device() | mutex_lock(&ndev->req_lock); test_bit(NCI_UP, &ndev->flags); | ... | test_and_clear_bit(NCI_UP, &ndev->flags) mutex_lock(&ndev->req_lock); | | This race will allow __nci_request() to be awaked while the device is getting removed. Similar to commit e2cb6b891ad2 ("bluetooth: eliminate the potential race condition when removing the HCI controller"). this patch alters the function sequence in nci_request() to prevent the data races between the nci_close_device(). Signed-off-by: Lin Ma Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Link: https://lore.kernel.org/r/20211115145600.8320-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6fd873aa86be..1dd0269c1a72 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -144,12 +144,15 @@ inline int nci_request(struct nci_dev *ndev, { int rc; - if (!test_bit(NCI_UP, &ndev->flags)) - return -ENETDOWN; - /* Serialize all requests */ mutex_lock(&ndev->req_lock); - rc = __nci_request(ndev, req, opt, timeout); + /* check the state after obtaing the lock against any races + * from nci_close_device when the device gets removed. + */ + if (test_bit(NCI_UP, &ndev->flags)) + rc = __nci_request(ndev, req, opt, timeout); + else + rc = -ENETDOWN; mutex_unlock(&ndev->req_lock); return rc; From 3e3b5dfcd16a3e254aab61bd1e8c417dd4503102 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 16 Nov 2021 23:26:52 +0800 Subject: [PATCH 240/336] NFC: reorder the logic in nfc_{un,}register_device There is a potential UAF between the unregistration routine and the NFC netlink operations. The race that cause that UAF can be shown as below: (FREE) | (USE) nfcmrvl_nci_unregister_dev | nfc_genl_dev_up nci_close_device | nci_unregister_device | nfc_get_device nfc_unregister_device | nfc_dev_up rfkill_destory | device_del | rfkill_blocked ... | ... The root cause for this race is concluded below: 1. The rfkill_blocked (USE) in nfc_dev_up is supposed to be placed after the device_is_registered check. 2. Since the netlink operations are possible just after the device_add in nfc_register_device, the nfc_dev_up() can happen anywhere during the rfkill creation process, which leads to data race. This patch reorder these actions to permit 1. Once device_del is finished, the nfc_dev_up cannot dereference the rfkill object. 2. The rfkill_register need to be placed after the device_add of nfc_dev because the parent device need to be created first. 
So this patch keeps the order but inject device_lock to prevent the data race. Signed-off-by: Lin Ma Fixes: be055b2f89b5 ("NFC: RFKILL support") Reviewed-by: Jakub Kicinski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211116152652.19217-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- net/nfc/core.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/net/nfc/core.c b/net/nfc/core.c index 3c645c1d99c9..dc7a2404efdf 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -94,13 +94,13 @@ int nfc_dev_up(struct nfc_dev *dev) device_lock(&dev->dev); - if (dev->rfkill && rfkill_blocked(dev->rfkill)) { - rc = -ERFKILL; + if (!device_is_registered(&dev->dev)) { + rc = -ENODEV; goto error; } - if (!device_is_registered(&dev->dev)) { - rc = -ENODEV; + if (dev->rfkill && rfkill_blocked(dev->rfkill)) { + rc = -ERFKILL; goto error; } @@ -1125,11 +1125,7 @@ int nfc_register_device(struct nfc_dev *dev) if (rc) pr_err("Could not register llcp device\n"); - rc = nfc_genl_device_added(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s was added\n", - dev_name(&dev->dev)); - + device_lock(&dev->dev); dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev); if (dev->rfkill) { @@ -1138,6 +1134,12 @@ int nfc_register_device(struct nfc_dev *dev) dev->rfkill = NULL; } } + device_unlock(&dev->dev); + + rc = nfc_genl_device_added(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s was added\n", + dev_name(&dev->dev)); return 0; } @@ -1154,10 +1156,17 @@ void nfc_unregister_device(struct nfc_dev *dev) pr_debug("dev_name=%s\n", dev_name(&dev->dev)); + rc = nfc_genl_device_removed(dev); + if (rc) + pr_debug("The userspace won't be notified that the device %s " + "was removed\n", dev_name(&dev->dev)); + + device_lock(&dev->dev); if (dev->rfkill) { rfkill_unregister(dev->rfkill); rfkill_destroy(dev->rfkill); } + device_unlock(&dev->dev); if (dev->ops->check_presence) { device_lock(&dev->dev); @@ -1167,11 +1176,6 @@ void nfc_unregister_device(struct nfc_dev *dev) cancel_work_sync(&dev->check_pres_work); } - rc = nfc_genl_device_removed(dev); - if (rc) - pr_debug("The userspace won't be notified that the device %s " - "was removed\n", dev_name(&dev->dev)); - nfc_llcp_unregister_device(dev); mutex_lock(&nfc_devlist_mutex); From 48b71a9e66c2eab60564b1b1c85f4928ed04e406 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 16 Nov 2021 23:27:32 +0800 Subject: [PATCH 241/336] NFC: add NCI_UNREG flag to eliminate the race There are two sites that calls queue_work() after the destroy_workqueue() and lead to possible UAF. The first site is nci_send_cmd(), which can happen after the nci_close_device as below nfcmrvl_nci_unregister_dev | nfc_genl_dev_up nci_close_device | flush_workqueue | del_timer_sync | nci_unregister_device | nfc_get_device destroy_workqueue | nfc_dev_up nfc_unregister_device | nci_dev_up device_del | nci_open_device | __nci_request | nci_send_cmd | queue_work !!! Another site is nci_cmd_timer, awaked by the nci_cmd_work from the nci_send_cmd. ... | ... nci_unregister_device | queue_work destroy_workqueue | nfc_unregister_device | ... device_del | nci_cmd_work | mod_timer | ... | nci_cmd_timer | queue_work !!! For the above two UAF, the root cause is that the nfc_dev_up can race between the nci_unregister_device routine. Therefore, this patch introduce NCI_UNREG flag to easily eliminate the possible race. 
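The flag is effective because it is consulted with ndev->req_lock held; a condensed, standalone illustration of that check-under-lock idiom (a pthread mutex standing in for req_lock, hypothetical names, not the NCI code itself):

	/* build: cc -pthread example.c */
	#include <errno.h>
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
	static bool unregistering;	/* analogue of the NCI_UNREG flag */

	static int example_open_device(void)
	{
		int rc = 0;

		pthread_mutex_lock(&req_lock);
		if (unregistering)
			rc = -ENODEV;	/* refuse to re-open once teardown has started */
		else
			printf("device opened\n");
		pthread_mutex_unlock(&req_lock);
		return rc;
	}

	int main(void)
	{
		example_open_device();

		pthread_mutex_lock(&req_lock);
		unregistering = true;	/* set before tearing the device down */
		pthread_mutex_unlock(&req_lock);

		return example_open_device() == -ENODEV ? 0 : 1;
	}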
In addition, the mutex_lock in nci_close_device can act as a barrier. Signed-off-by: Lin Ma Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reviewed-by: Jakub Kicinski Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20211116152732.19238-1-linma@zju.edu.cn Signed-off-by: Jakub Kicinski --- include/net/nfc/nci_core.h | 1 + net/nfc/nci/core.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index a964daedc17b..ea8595651c38 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -30,6 +30,7 @@ enum nci_flag { NCI_UP, NCI_DATA_EXCHANGE, NCI_DATA_EXCHANGE_TO, + NCI_UNREG, }; /* NCI device states */ diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 1dd0269c1a72..d2537383a3e8 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -476,6 +476,11 @@ static int nci_open_device(struct nci_dev *ndev) mutex_lock(&ndev->req_lock); + if (test_bit(NCI_UNREG, &ndev->flags)) { + rc = -ENODEV; + goto done; + } + if (test_bit(NCI_UP, &ndev->flags)) { rc = -EALREADY; goto done; @@ -548,6 +553,10 @@ done: static int nci_close_device(struct nci_dev *ndev) { nci_req_cancel(ndev, ENODEV); + + /* This mutex needs to be held as a barrier for + * caller nci_unregister_device + */ mutex_lock(&ndev->req_lock); if (!test_and_clear_bit(NCI_UP, &ndev->flags)) { @@ -585,8 +594,8 @@ static int nci_close_device(struct nci_dev *ndev) del_timer_sync(&ndev->cmd_timer); - /* Clear flags */ - ndev->flags = 0; + /* Clear flags except NCI_UNREG */ + ndev->flags &= BIT(NCI_UNREG); mutex_unlock(&ndev->req_lock); @@ -1269,6 +1278,12 @@ void nci_unregister_device(struct nci_dev *ndev) { struct nci_conn_info *conn_info, *n; + /* This set_bit is not protected with specialized barrier, + * However, it is fine because the mutex_lock(&ndev->req_lock); + * in nci_close_device() will help to emit one. + */ + set_bit(NCI_UNREG, &ndev->flags); + nci_close_device(ndev); destroy_workqueue(ndev->cmd_wq); From a280ef90af01dc133d0a52387e563015686d6294 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 17 Nov 2021 10:34:54 +0300 Subject: [PATCH 242/336] octeontx2-af: debugfs: don't corrupt user memory The user supplies the "count" value to say how big its read buffer is. The rvu_dbg_lmtst_map_table_display() function does not take the "count" into account but instead just copies the whole table, potentially corrupting the user's data. Introduce the "ret" variable to store how many bytes we can copy. Also I changed the type of "off" to size_t to make using min() simpler. 
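The fix follows the standard read() contract: never copy more than the caller's buffer can hold and report how many bytes were actually copied. A small userspace model of that clamping (memcpy standing in for copy_to_user, sizes chosen for the example):

	#include <stdio.h>
	#include <string.h>

	static size_t example_read(char *dst, size_t count, const char *table, size_t off)
	{
		size_t n = off < count ? off : count;	/* min(off, count) */

		memcpy(dst, table, n);	/* copy_to_user() equivalent in this sketch */
		return n;
	}

	int main(void)
	{
		const char table[] = "PF0 VF0 0x1000\nPF0 VF1 0x2000\n";	/* pretend map table */
		char small[8];
		size_t copied = example_read(small, sizeof(small), table, strlen(table));

		printf("requested %zu, table %zu, copied %zu\n",
		       sizeof(small), strlen(table), copied);
		return 0;
	}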
Fixes: 0daa55d033b0 ("octeontx2-af: cn10k: debugfs for dumping LMTST map table") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20211117073454.GD5237@kili Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/rvu_debugfs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index c7fd466a0efd..a09a507369ac 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -236,10 +236,11 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, u64 lmt_addr, val, tbl_base; int pf, vf, num_vfs, hw_vfs; void __iomem *lmt_map_base; - int index = 0, off = 0; - int bytes_not_copied; int buf_size = 10240; + size_t off = 0; + int index = 0; char *buf; + int ret; /* don't allow partial reads */ if (*ppos != 0) @@ -303,15 +304,17 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, } off += scnprintf(&buf[off], buf_size - 1 - off, "\n"); - bytes_not_copied = copy_to_user(buffer, buf, off); + ret = min(off, count); + if (copy_to_user(buffer, buf, ret)) + ret = -EFAULT; kfree(buf); iounmap(lmt_map_base); - if (bytes_not_copied) - return -EFAULT; + if (ret < 0) + return ret; - *ppos = off; - return off; + *ppos = ret; + return ret; } RVU_DEBUG_FOPS(lmtst_map_table, lmtst_map_table_display, NULL); From 06f6c4c6c3e8354dceddd77bd58f9a7a84c67246 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 15 Nov 2021 12:47:26 +0900 Subject: [PATCH 243/336] ata: libata: add missing ata_identify_page_supported() calls ata_dev_config_ncq_prio() and ata_dev_config_devslp() both access pages of the IDENTIFY DEVICE data log. Before calling ata_read_log_page(), make sure to check for the existence of the IDENTIFY DEVICE data log and of the log page accessed using ata_identify_page_supported(). This avoids useless error messages from ata_read_log_page() and failures with some LLDD scsi drivers using libsas. Reported-by: Nikolay Cc: stable@kernel.org # 5.15 Signed-off-by: Damien Le Moal Tested-by: Matthew Perkowski --- drivers/ata/libata-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index edaedcd82630..59ad8c979cb3 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2178,6 +2178,9 @@ static void ata_dev_config_ncq_prio(struct ata_device *dev) struct ata_port *ap = dev->link->ap; unsigned int err_mask; + if (!ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) + return; + err_mask = ata_read_log_page(dev, ATA_LOG_IDENTIFY_DEVICE, ATA_LOG_SATA_SETTINGS, @@ -2454,7 +2457,8 @@ static void ata_dev_config_devslp(struct ata_device *dev) * Check device sleep capability. Get DevSlp timing variables * from SATA Settings page of Identify Device Data Log. */ - if (!ata_id_has_devslp(dev->id)) + if (!ata_id_has_devslp(dev->id) || + !ata_identify_page_supported(dev, ATA_LOG_SATA_SETTINGS)) return; err_mask = ata_read_log_page(dev, From 1527f69204fe35f341cb599f1cb01bd02daf4374 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 12 Nov 2021 14:15:38 -0600 Subject: [PATCH 244/336] ata: ahci: Add Green Sardine vendor ID as board_ahci_mobile AMD requires that the SATA controller be configured for devsleep in order for S0i3 entry to work properly. 
commit b1a9585cc396 ("ata: ahci: Enable DEVSLP by default on x86 with SLP_S0") sets up a kernel policy to enable devsleep on Intel mobile platforms that are using s0ix. Add the PCI ID for the SATA controller in Green Sardine platforms to extend this policy by default for AMD based systems using s0i3 as well. Cc: Nehal-bakulchandra Shah BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214091 Signed-off-by: Mario Limonciello Signed-off-by: Damien Le Moal --- drivers/ata/ahci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index d60f34718b5d..1e1167e725a4 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -438,6 +438,7 @@ static const struct pci_device_id ahci_pci_tbl[] = { /* AMD */ { PCI_VDEVICE(AMD, 0x7800), board_ahci }, /* AMD Hudson-2 */ { PCI_VDEVICE(AMD, 0x7900), board_ahci }, /* AMD CZ */ + { PCI_VDEVICE(AMD, 0x7901), board_ahci_mobile }, /* AMD Green Sardine */ /* AMD is using RAID class only for ahci controllers */ { PCI_VENDOR_ID_AMD, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_RAID << 8, 0xffffff, board_ahci }, From 7c5f641a5914ce0303b06bcfcd7674ee64aeebe9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 12 Nov 2021 14:15:39 -0600 Subject: [PATCH 245/336] ata: libahci: Adjust behavior when StorageD3Enable _DSD is set The StorageD3Enable _DSD is used for the vendor to indicate that the disk should be opted into or out of a different behavior based upon the platform design. For AMD's Renoir and Green Sardine platforms it's important that any attached SATA storage has transitioned into DevSlp when s2idle is used. If the disk is left in active/partial/slumber, then the system is not able to resume properly. When the StorageD3Enable _DSD is detected, check the system is using s2idle and DevSlp is enabled and if so explicitly wait long enough for the disk to enter DevSlp. Cc: Nehal-bakulchandra Shah BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=214091 Link: https://docs.microsoft.com/en-us/windows-hardware/design/component-guidelines/power-management-for-storage-hardware-devices-intro Signed-off-by: Mario Limonciello Signed-off-by: Damien Le Moal --- drivers/ata/libahci.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 8a6835bfd18a..f76b8418e6fb 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -2323,6 +2323,18 @@ int ahci_port_resume(struct ata_port *ap) EXPORT_SYMBOL_GPL(ahci_port_resume); #ifdef CONFIG_PM +static void ahci_handle_s2idle(struct ata_port *ap) +{ + void __iomem *port_mmio = ahci_port_base(ap); + u32 devslp; + + if (pm_suspend_via_firmware()) + return; + devslp = readl(port_mmio + PORT_DEVSLP); + if ((devslp & PORT_DEVSLP_ADSE)) + ata_msleep(ap, devslp_idle_timeout); +} + static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg) { const char *emsg = NULL; @@ -2336,6 +2348,9 @@ static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg) ata_port_freeze(ap); } + if (acpi_storage_d3(ap->host->dev)) + ahci_handle_s2idle(ap); + ahci_rpm_put_port(ap); return rc; } From cac7e8b5f5fa94e28d581fbb9e76cb1c0c7fd56a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Nov 2021 14:31:41 +0900 Subject: [PATCH 246/336] ata: libata-sata: Declare ata_ncq_sdev_attrs static Since ata_ncq_sdev_attrs is a local struct, declare it static. This avoids a sparse warning at compile time. 
Signed-off-by: Damien Le Moal --- drivers/ata/libata-sata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c index 4e88597aa9df..5b78e86e3459 100644 --- a/drivers/ata/libata-sata.c +++ b/drivers/ata/libata-sata.c @@ -922,7 +922,7 @@ DEVICE_ATTR(ncq_prio_enable, S_IRUGO | S_IWUSR, ata_ncq_prio_enable_show, ata_ncq_prio_enable_store); EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_enable); -struct attribute *ata_ncq_sdev_attrs[] = { +static struct attribute *ata_ncq_sdev_attrs[] = { &dev_attr_unload_heads.attr, &dev_attr_ncq_prio_enable.attr, &dev_attr_ncq_prio_supported.attr, From dc23a5110b106da13502de5735399ad83ed1a682 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 15 Nov 2021 14:41:31 +0000 Subject: [PATCH 247/336] cpuid: kvm_find_kvm_cpuid_features() should be declared 'static' The lack of a static declaration currently results in: arch/x86/kvm/cpuid.c:128:26: warning: no previous prototype for function 'kvm_find_kvm_cpuid_features' when compiling with "W=1". Reported-by: kernel test robot Fixes: 760849b1476c ("KVM: x86: Make sure KVM_CPUID_FEATURES really are KVM_CPUID_FEATURES") Signed-off-by: Paul Durrant Message-Id: <20211115144131.5943-1-pdurrant@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e19dabf1848b..07e9215e911d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -125,7 +125,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) { u32 base = vcpu->arch.kvm_cpuid_base; From 964b7aa0b040bdc6ec1c543ee620cda3f8b4c68a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 14 Nov 2021 08:59:02 +0000 Subject: [PATCH 248/336] KVM: Fix steal time asm constraints In 64-bit mode, x86 instruction encoding allows us to use the low 8 bits of any GPR as an 8-bit operand. In 32-bit mode, however, we can only use the [abcd] registers. For which, GCC has the "q" constraint instead of the less restrictive "r". Also fix st->preempted, which is an input/output operand rather than an input. Fixes: 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status") Reported-by: kernel test robot Signed-off-by: David Woodhouse Message-Id: <89bf72db1b859990355f9c40713a34e0d2d86c98.camel@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2c03b76caf11..4ae77e1dadf6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3307,9 +3307,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu) "xor %1, %1\n" "2:\n" _ASM_EXTABLE_UA(1b, 2b) - : "+r" (st_preempted), - "+&r" (err) - : "m" (st->preempted)); + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); if (err) goto out; From af957eebfcc17433ee83ab85b1195a933ab5049c Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 15 Nov 2021 15:18:36 +0200 Subject: [PATCH 249/336] KVM: nVMX: don't use vcpu->arch.efer when checking host state on nested state load When loading nested state, don't use vcpu->arch.efer to get the L1 host's 64-bit vs.
32-bit state and don't check it for consistency with respect to VM_EXIT_HOST_ADDR_SPACE_SIZE, as register state in the vCPU may be stale when KVM_SET_NESTED_STATE is called---and architecturally does not exist. When restoring L2 state in KVM, the CPU is placed in non-root where nested VMX code has no snapshot of L1 host state: VMX (conditionally) loads host state fields loaded on VM-exit, but they need not correspond to the state before entry. A simple case occurs in KVM itself, where the host RIP field points to vmx_vmexit rather than the instruction following vmlaunch/vmresume. However, for the particular case of L1 being in 32- or 64-bit mode on entry, the exit controls can be treated instead as the source of truth regarding the state of L1 on entry, and can be used to check that vmcs12.VM_EXIT_HOST_ADDR_SPACE_SIZE matches vmcs12.HOST_EFER if vmcs12.VM_EXIT_LOAD_IA32_EFER is set. The consistency check on CPU EFER vs. vmcs12.VM_EXIT_HOST_ADDR_SPACE_SIZE, instead, happens only on VM-Enter. That's because, again, there's conceptually no "current" L1 EFER to check on KVM_SET_NESTED_STATE. Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Message-Id: <20211115131837.195527-2-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b213ca966d41..e307d3c1d26b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2830,6 +2830,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ +#ifdef CONFIG_X86_64 + if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != + !!(vcpu->arch.efer & EFER_LMA))) + return -EINVAL; +#endif + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2854,18 +2865,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_X86_64 - ia32e = !!(vcpu->arch.efer & EFER_LMA); + ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); #else ia32e = false; #endif if (ia32e) { - if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || - CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { - if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || - CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; @@ -3535,6 +3544,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + if (nested_vmx_check_address_space_size(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); From b8453cdcf26020030da182f0156d7bf59ae5719f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 15 Nov 2021 15:18:37 +0200 Subject: [PATCH 250/336] KVM: x86/mmu: include EFER.LMA in extended mmu role Incorporate EFER.LMA into kvm_mmu_extended_role, as it is used to compute the guest root level and is not reflected in kvm_mmu_page_role.level when TDP is in use.
When simply running the guest, it is impossible for EFER.LMA and kvm_mmu.root_level to get out of sync, as the guest cannot transition from PAE paging to 64-bit paging without toggling CR0.PG, i.e. without first bouncing through a different MMU context. And stuffing guest state via KVM_SET_SREGS{,2} also ensures a full MMU context reset. However, if KVM_SET_SREGS{,2} is followed by KVM_SET_NESTED_STATE, e.g. to set guest state when migrating the VM while L2 is active, the vCPU state will reflect L2, not L1. If L1 is using TDP for L2, then root_mmu will have been configured using L2's state, despite not being used for L2. If L2.EFER.LMA != L1.EFER.LMA, and L2 is using PAE paging, then root_mmu will be configured for guest PAE paging, but will match the mmu_role for 64-bit paging and cause KVM to not reconfigure root_mmu on the next nested VM-Exit. Alternatively, the root_mmu's role could be invalidated after a successful KVM_SET_NESTED_STATE that yields vcpu->arch.mmu != vcpu->arch.root_mmu, i.e. that switches the active mmu to guest_mmu, but doing so is unnecessarily tricky, and not even needed if L1 and L2 do have the same role (e.g., they are both 64-bit guests and run with the same CR4). Suggested-by: Sean Christopherson Signed-off-by: Maxim Levitsky Message-Id: <20211115131837.195527-3-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33e3292233f3..e977634333d4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -363,6 +363,7 @@ union kvm_mmu_extended_role { unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; + unsigned int efer_lma:1; }; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 04c00c34517e..0571b1c7bf6f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4682,6 +4682,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, /* PKEY and LA57 are active iff long mode is active. */ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + ext.efer_lma = ____is_efer_lma(regs); } ext.valid = 1; From 4e8436479ad3be76a3823e6ce466ae464ce71300 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:21 +0000 Subject: [PATCH 251/336] KVM: x86/xen: Fix get_attr of KVM_XEN_ATTR_TYPE_SHARED_INFO In commit 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache") we stopped storing this in-kernel as a GPA, and started storing it as a GFN. Which means we probably should have stopped calling gpa_to_gfn() on it when userspace asks for it back. 
Cc: stable@vger.kernel.org Fixes: 319afe68567b ("KVM: xen: do not use struct gfn_to_hva_cache") Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9ea9c3dabe37..79090dbe1e66 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -282,7 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); + data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn; r = 0; break; From 297d597a6da38fc1b40fa470044a767e33801438 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:24 +0000 Subject: [PATCH 252/336] KVM: nVMX: Use kvm_{read,write}_guest_cached() for shadow_vmcs12 Using kvm_vcpu_map() for reading from the guest is entirely gratuitous, when all we do is a single memcpy and unmap it again. Fix it up to use kvm_read_guest()... but in fact I couldn't bring myself to do that without also making it use a gfn_to_hva_cache for both that *and* the copy in the other direction. Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-5-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 +++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 +++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e307d3c1d26b..fff6b326dc2b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -670,33 +670,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct kvm_host_map map; - struct vmcs12 *shadow; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - shadow = get_shadow_vmcs12(vcpu); - - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - memcpy(shadow, map.hva, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) + return; + + kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a4ead6023133..cdadbd5dc0ca 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -141,6 +141,11 @@ struct nested_vmx { */ struct vmcs12 *cached_shadow_vmcs12; + /* + * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer + */ + struct gfn_to_hva_cache shadow_vmcs12_cache; + /* * Indicates if the shadow vmcs or enlightened vmcs 
must be updated * with the data held by struct vmcs12. From 6a834754a568bb809a1466001a0523dab8e6adef Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:23 +0000 Subject: [PATCH 253/336] KVM: x86/xen: Use sizeof_field() instead of open-coding it Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-4-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/xen.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 79090dbe1e66..272be5c1ebed 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -127,9 +127,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) state_entry_time = vx->runstate_entry_time; state_entry_time |= XEN_RUNSTATE_UPDATE; - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -144,9 +144,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -163,9 +163,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != - sizeof(((struct compat_vcpu_runstate_info *)0)->time)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != + sizeof_field(struct compat_vcpu_runstate_info, time)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -204,9 +204,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); /* * For efficiency, this mirrors the checks for using the valid From 7d0172b3ca4231f0a0566157e9a28a3d3ffc0142 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:25 +0000 Subject: [PATCH 254/336] KVM: nVMX: Use kvm_read_guest_offset_cached() for nested VMCS check Kill another mostly gratuitous kvm_vcpu_map() which could just use the userspace HVA for it. 
Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-6-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fff6b326dc2b..4fb904ff7f12 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2925,9 +2925,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r = 0; - struct vmcs12 *shadow; - struct kvm_host_map map; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + struct vmcs_hdr hdr; if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; @@ -2935,17 +2935,21 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) + if (ghc->gpa != vmcs12->vmcs_link_pointer && + CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE))) + return -EINVAL; + + if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr)))) return -EINVAL; - shadow = map.hva; + if (CC(hdr.revision_id != VMCS12_REVISION) || + CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) + return -EINVAL; - if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || - CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) - r = -EINVAL; - - kvm_vcpu_unmap(vcpu, &map, false); - return r; + return 0; } /* From cee66664dcd6241a943380ef9dcd2f8a0a7dc47d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:26 +0000 Subject: [PATCH 255/336] KVM: nVMX: Use a gfn_to_hva_cache for vmptrld And thus another call to kvm_vcpu_map() can die. 
Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-7-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 26 +++++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 +++++ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4fb904ff7f12..1e2f66951566 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5286,10 +5286,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { - struct kvm_host_map map; - struct vmcs12 *new_vmcs12; + struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; + struct vmcs_hdr hdr; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { + if (ghc->gpa != vmptr && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -5300,12 +5301,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = map.hva; + if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && + if (hdr.revision_id != VMCS12_REVISION || + (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5316,8 +5321,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, + VMCS12_SIZE)) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } set_current_vmptr(vmx, vmptr); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index cdadbd5dc0ca..4df2ac24ffc1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -146,6 +146,11 @@ struct nested_vmx { */ struct gfn_to_hva_cache shadow_vmcs12_cache; + /* + * GPA to HVA cache for VMCS12 + */ + struct gfn_to_hva_cache vmcs12_cache; + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. From 357a18ad230f0867791b788d2b1d6f280f6f6e61 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 15 Nov 2021 16:50:27 +0000 Subject: [PATCH 256/336] KVM: Kill kvm_map_gfn() / kvm_unmap_gfn() and gfn_to_pfn_cache In commit 7e2175ebd695 ("KVM: x86: Fix recording of guest steal time / preempted status") I removed the only user of these functions because it was basically impossible to use them safely. There are two stages to the GFN->PFN mapping; first through the KVM memslots to a userspace HVA and then through the page tables to translate that HVA to an underlying PFN. Invalidations of the former were being handled correctly, but no attempt was made to use the MMU notifiers to invalidate the cache when the HVA->GFN mapping changed. As a prelude to reinventing the gfn_to_pfn_cache with more usable semantics, rip it out entirely and untangle the implementation of the unsafe kvm_vcpu_map()/kvm_vcpu_unmap() functions from it. All current users of kvm_vcpu_map() also look broken right now, and will be dealt with separately. 
They broadly fall into two classes: * Those which map, access the data and immediately unmap. This is mostly gratuitous and could just as well use the existing user HVA, and could probably benefit from a gfn_to_hva_cache as they do so. * Those which keep the mapping around for a longer time, perhaps even using the PFN directly from the guest. These will need to be converted to the new gfn_to_pfn_cache and then kvm_vcpu_map() can be removed too. Signed-off-by: David Woodhouse Message-Id: <20211115165030.7422-8-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +-- include/linux/kvm_types.h | 7 --- virt/kvm/kvm_main.c | 100 +++++--------------------------------- 3 files changed, 12 insertions(+), 101 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 60a35d9fe259..eb625af4fc5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -866,7 +866,7 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); @@ -942,12 +942,8 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool atomic); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool dirty, bool atomic); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 2237abb93ccd..234eab059839 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,13 +53,6 @@ struct gfn_to_hva_cache { struct kvm_memory_slot *memslot; }; -struct gfn_to_pfn_cache { - u64 generation; - gfn_t gfn; - kvm_pfn_t pfn; - bool dirty; -}; - #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE /* * Memory caches are used to preallocate memory ahead of various MMU flows, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3f6d450355f0..7a28c29dca8a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2548,72 +2548,36 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) +void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) { if (pfn == 0) return; - if (cache) - cache->pfn = cache->gfn = 0; - if (dirty) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); } -static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, - struct gfn_to_pfn_cache *cache, u64 gen) -{ - kvm_release_pfn(cache->pfn, cache->dirty, cache); - - cache->pfn = gfn_to_pfn_memslot(slot, gfn); - 
cache->gfn = gfn; - cache->dirty = false; - cache->generation = gen; -} - -static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, - struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, - bool atomic) +int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { kvm_pfn_t pfn; void *hva = NULL; struct page *page = KVM_UNMAPPED_PAGE; - struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); - u64 gen = slots->generation; if (!map) return -EINVAL; - if (cache) { - if (!cache->pfn || cache->gfn != gfn || - cache->generation != gen) { - if (atomic) - return -EAGAIN; - kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); - } - pfn = cache->pfn; - } else { - if (atomic) - return -EAGAIN; - pfn = gfn_to_pfn_memslot(slot, gfn); - } + pfn = gfn_to_pfn(vcpu->kvm, gfn); if (is_error_noslot_pfn(pfn)) return -EINVAL; if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (atomic) - hva = kmap_atomic(page); - else - hva = kmap(page); + hva = kmap(page); #ifdef CONFIG_HAS_IOMEM - } else if (!atomic) { - hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); } else { - return -EINVAL; + hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); #endif } @@ -2627,27 +2591,9 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, return 0; } - -int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool atomic) -{ - return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, - cache, atomic); -} -EXPORT_SYMBOL_GPL(kvm_map_gfn); - -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) -{ - return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, - NULL, false); -} EXPORT_SYMBOL_GPL(kvm_vcpu_map); -static void __kvm_unmap_gfn(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, - bool dirty, bool atomic) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { if (!map) return; @@ -2655,45 +2601,21 @@ static void __kvm_unmap_gfn(struct kvm *kvm, if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) { - if (atomic) - kunmap_atomic(map->hva); - else - kunmap(map->page); - } + if (map->page != KVM_UNMAPPED_PAGE) + kunmap(map->page); #ifdef CONFIG_HAS_IOMEM - else if (!atomic) - memunmap(map->hva); else - WARN_ONCE(1, "Unexpected unmapping in atomic context"); + memunmap(map->hva); #endif if (dirty) - mark_page_dirty_in_slot(kvm, memslot, map->gfn); + kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - if (cache) - cache->dirty |= dirty; - else - kvm_release_pfn(map->pfn, dirty, NULL); + kvm_release_pfn(map->pfn, dirty); map->hva = NULL; map->page = NULL; } - -int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, - struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) -{ - __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map, - cache, dirty, atomic); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_unmap_gfn); - -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) -{ - __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), - map, NULL, dirty, false); -} EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) From 79b11142763791bdead8b6460052cbdde8e08e2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 21:50:56 +0000 Subject: [PATCH 257/336] KVM: SEV: Disallow COPY_ENC_CONTEXT_FROM if target has created vCPUs Reject COPY_ENC_CONTEXT_FROM if the destination VM has created vCPUs. 
KVM relies on SEV activation to occur before vCPUs are created, e.g. to set VMCB flags and intercepts correctly. Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context") Cc: stable@vger.kernel.org Cc: Peter Gonda Cc: Marc Orr Cc: Sean Christopherson Cc: Nathan Tempelman Cc: Brijesh Singh Cc: Tom Lendacky Signed-off-by: Sean Christopherson Message-Id: <20211109215101.2211373-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3e2769855e51..eeec499e4372 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1775,7 +1775,12 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mutex_unlock(&source_kvm->lock); mutex_lock(&kvm->lock); - if (sev_guest(kvm)) { + /* + * Disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. + */ + if (sev_guest(kvm) || kvm->created_vcpus) { ret = -EINVAL; goto e_mirror_unlock; } From a41fb26e61697382b2428ae63e039e97b0e6d164 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 21:50:58 +0000 Subject: [PATCH 258/336] KVM: SEV: Set sev_info.active after initial checks in sev_guest_init() Set sev_info.active during SEV/SEV-ES activation before calling any code that can potentially consume sev_info.es_active, e.g. set "active" and "es_active" as a pair immediately after the initial sanity checks. KVM generally expects that es_active can be true if and only if active is true, e.g. sev_asid_new() deliberately avoids sev_es_guest() so that it doesn't get a false negative. This will allow WARNing in sev_es_guest() if the VM is tagged as SEV-ES but not SEV. Signed-off-by: Sean Christopherson Message-Id: <20211109215101.2211373-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index eeec499e4372..50b9d76e9137 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -229,7 +229,6 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - bool es_active = argp->id == KVM_SEV_ES_INIT; int asid, ret; if (kvm->created_vcpus) @@ -239,7 +238,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - sev->es_active = es_active; + sev->active = true; + sev->es_active = argp->id == KVM_SEV_ES_INIT; asid = sev_asid_new(sev); if (asid < 0) goto e_no_asid; @@ -249,7 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (ret) goto e_free; - sev->active = true; sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); @@ -260,6 +259,7 @@ e_free: sev->asid = 0; e_no_asid: sev->es_active = false; + sev->active = false; return ret; } From 1bd00a4257a86db654499137fd8e6db7d1e484dc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 21:50:59 +0000 Subject: [PATCH 259/336] KVM: SEV: WARN if SEV-ES is marked active but SEV is not WARN if the VM is tagged as SEV-ES but not SEV. KVM relies on SEV and SEV-ES being set atomically, and guards common flows with "is SEV", i.e. observing SEV-ES without SEV means KVM has a fatal bug. 
Signed-off-by: Sean Christopherson Message-Id: <20211109215101.2211373-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0d7bbe548ac3..a345f557be4a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -242,7 +242,7 @@ static inline bool sev_es_guest(struct kvm *kvm) #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - return sev_guest(kvm) && sev->es_active; + return sev->es_active && !WARN_ON_ONCE(!sev->active); #else return false; #endif From ea410ef4dad6282420bde00e1ffca874344f0e95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 21:51:00 +0000 Subject: [PATCH 260/336] KVM: SEV: Drop a redundant setting of sev->asid during initialization Remove a fully redundant write to sev->asid during SEV/SEV-ES guest initialization. The ASID is set a few lines earlier prior to the call to sev_platform_init(), which doesn't take "sev" as a param, i.e. can't muck with the ASID barring some truly magical behind-the-scenes code. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211109215101.2211373-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 50b9d76e9137..80692435ac3d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -249,7 +249,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (ret) goto e_free; - sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; From 8e38e96a4e616ed0936faa964ceeb5d390b6425e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 9 Nov 2021 21:51:01 +0000 Subject: [PATCH 261/336] KVM: SEV: Fix typo in and tweak name of cmd_allowed_from_miror() Rename cmd_allowed_from_miror() to is_cmd_allowed_from_mirror(), fixing a typo and making it obvious that the result is a boolean where false means "not allowed". No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211109215101.2211373-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 80692435ac3d..87874c586531 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1509,7 +1509,7 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } -static bool cmd_allowed_from_miror(u32 cmd_id) +static bool is_cmd_allowed_from_mirror(u32 cmd_id) { /* * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES @@ -1541,7 +1541,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) /* Only the enc_context_owner handles some memory enc operations. */ if (is_mirroring_enc_context(kvm) && - !cmd_allowed_from_miror(sev_cmd.id)) { + !is_cmd_allowed_from_mirror(sev_cmd.id)) { r = -EINVAL; goto out; } From 0e2e641921000ffc647b12918cdfcc504a9f6e3b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Nov 2021 20:47:06 -0700 Subject: [PATCH 262/336] riscv: kvm: fix non-kernel-doc comment block Don't use "/**" to begin a comment block for a non-kernel-doc comment. Prevents this docs build warning: vcpu_sbi.c:3: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * Copyright (c) 2019 Western Digital Corporation or its affiliates. Fixes: dea8ee31a039 ("RISC-V: KVM: Add SBI v0.1 support") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Atish Patra Cc: Anup Patel Cc: kvm@vger.kernel.org Cc: kvm-riscv@lists.infradead.org Cc: linux-riscv@lists.infradead.org Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Message-Id: <20211107034706.30672-1-rdunlap@infradead.org> Signed-off-by: Paolo Bonzini --- arch/riscv/kvm/vcpu_sbi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index eb3c045edf11..3b0e703d22cf 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * Copyright (c) 2019 Western Digital Corporation or its affiliates. * * Authors: From b768f60bd9791c528a4a4b98659674787b2e7c63 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Nov 2021 12:03:25 -0300 Subject: [PATCH 263/336] selftests: KVM: Add /x86_64/sev_migrate_tests to .gitignore $ git status nothing to commit, working tree clean $ $ make -C tools/testing/selftests/kvm/ > /dev/null 2>&1 $ git status Untracked files: (use "git add ..." to include in what will be committed) tools/testing/selftests/kvm/x86_64/sev_migrate_tests nothing added to commit but untracked files present (use "git add" to track) $ Fixes: 6a58150859fdec76 ("selftest: KVM: Add intra host migration tests") Cc: Brijesh Singh Cc: David Rientjes Cc: Linus Torvalds Cc: Marc Orr Cc: Paolo Bonzini Cc: Peter Gonda Cc: Sean Christopherson Cc: Tom Lendacky Signed-off-by: Arnaldo Carvalho de Melo Message-Id: Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d4a830139683..3763105029fb 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -23,6 +23,7 @@ /x86_64/platform_info_test /x86_64/set_boot_cpu_id /x86_64/set_sregs_test +/x86_64/sev_migrate_tests /x86_64/smm_test /x86_64/state_test /x86_64/svm_vmcall_test From b5aead0064f33ae5e693a364e3204fe1c0ac9af2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 24 May 2021 12:48:57 -0500 Subject: [PATCH 264/336] KVM: x86: Assume a 64-bit hypercall for guests with protected state When processing a hypercall for a guest with protected state, currently SEV-ES guests, the guest CS segment register can't be checked to determine if the guest is in 64-bit mode. For an SEV-ES guest, it is expected that communication between the guest and the hypervisor is performed to shared memory using the GHCB. In order to use the GHCB, the guest must have been in long mode, otherwise writes by the guest to the GHCB would be encrypted and not be able to be comprehended by the hypervisor. Create a new helper function, is_64_bit_hypercall(), that assumes the guest is in 64-bit mode when the guest has protected state, and returns true, otherwise invoking is_64_bit_mode() to determine the mode. Update the hypercall related routines to use is_64_bit_hypercall() instead of is_64_bit_mode(). Add a WARN_ON_ONCE() to is_64_bit_mode() to catch occurences of calls to this helper function for a guest running with protected state. 
Fixes: f1c6366e3043 ("KVM: SVM: Add required changes to support intercepts under SEV-ES") Reported-by: Sean Christopherson Signed-off-by: Tom Lendacky Message-Id: Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86.h | 12 ++++++++++++ arch/x86/kvm/xen.c | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4a555f32885a..5e19e6e4c2ce 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2022,7 +2022,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { @@ -2171,7 +2171,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } #ifdef CONFIG_X86_64 - if (is_64_bit_mode(vcpu)) { + if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c85d70107057..ecd2041570d1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8848,7 +8848,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_mode(vcpu); + op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ea264c4502e4..997669ae9caa 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -153,12 +153,24 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) { int cs_db, cs_l; + WARN_ON_ONCE(vcpu->arch.guest_state_protected); + if (!is_long_mode(vcpu)) return false; static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } +static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu) +{ + /* + * If running with protected guest state, the CS register is not + * accessible. The hypercall register values will have had to been + * provided in 64-bit mode, so assume the guest is in 64-bit. + */ + return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu); +} + static inline bool x86_exception_has_error_code(unsigned int vector) { static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 565da9c3853b..dff2bdf9507a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -698,7 +698,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (!longmode) { params[0] = (u32)kvm_rbx_read(vcpu); params[1] = (u32)kvm_rcx_read(vcpu); From f60a00d7295057cb4baea5a321501efc72794453 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:38 +0100 Subject: [PATCH 265/336] KVM: arm64: Cap KVM_CAP_NR_VCPUS by kvm_arm_default_max_vcpus() Generally, it doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs. Note: ARM64 is special as the value returned by KVM_CAP_MAX_VCPUS differs depending on whether it is a system-wide ioctl or a per-VM one. Previously, KVM_CAP_NR_VCPUS didn't have this difference and it seems preferable to keep the status quo. Cap KVM_CAP_NR_VCPUS by kvm_arm_default_max_vcpus() which is what gets returned by system-wide KVM_CAP_MAX_VCPUS. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20211116163443.88707-2-vkuznets@redhat.com> Acked-by: Marc Zyngier Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/arm.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 2f03cbfefe67..e4727dc771bf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -223,7 +223,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + /* + * ARM64 treats KVM_CAP_NR_CPUS differently from all other + * architectures, as it does not always bound it to + * KVM_CAP_MAX_VCPUS. It should not matter much because + * this is just an advisory value. + */ + r = min_t(unsigned int, num_online_cpus(), + kvm_arm_default_max_vcpus()); break; case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: From 57a2e13ebdda8b65602b44ec8b80e385603eb84c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:39 +0100 Subject: [PATCH 266/336] KVM: MIPS: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS It doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs. Signed-off-by: Vitaly Kuznetsov Message-Id: <20211116163443.88707-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 562aa878b266..aa20d074d388 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1067,7 +1067,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; From b7915d55b1ac0e68a7586697fa2d06c018135c49 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:40 +0100 Subject: [PATCH 267/336] KVM: PPC: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS It doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs. Signed-off-by: Vitaly Kuznetsov Message-Id: <20211116163443.88707-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 35e9cccdeef9..a72920f4f221 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -641,9 +641,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * implementations just count online CPUs. */ if (hv_enabled) - r = num_present_cpus(); + r = min_t(unsigned int, num_present_cpus(), KVM_MAX_VCPUS); else - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; From 37fd3ce1e64a2e86a986542a17bebf1553512da9 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:41 +0100 Subject: [PATCH 268/336] KVM: RISC-V: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS It doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs. 
Signed-off-by: Vitaly Kuznetsov Acked-by: Anup Patel Reviewed-by: Anup Patel Message-Id: <20211116163443.88707-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/riscv/kvm/vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 26399df15b63..fb18af34a4b5 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -74,7 +74,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; From 82cc27eff4486f8e79ef8faac1af1f5573039aa4 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:42 +0100 Subject: [PATCH 269/336] KVM: s390: Cap KVM_CAP_NR_VCPUS by num_online_cpus() KVM_CAP_NR_VCPUS is a legacy advisory value which on other architectures returns num_online_cpus() capped by KVM_CAP_MAX_VCPUS or something else (ppc and arm64 are special cases). On s390, KVM_CAP_NR_VCPUS returns the same as KVM_CAP_MAX_VCPUS and this may turn out to be bad 'advice'. Switch s390 to returning capped num_online_cpus() too. Acked-by: Christian Borntraeger Signed-off-by: Vitaly Kuznetsov Reviewed-by: Christian Borntraeger Message-Id: <20211116163443.88707-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c6257f625929..14a18ba5ff2c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -585,6 +585,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; + if (ext == KVM_CAP_NR_VCPUS) + r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; From 2845e7353bc334d43309f5ea6d376c8fdbc94c93 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 16 Nov 2021 17:34:43 +0100 Subject: [PATCH 270/336] KVM: x86: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS It doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs.
Signed-off-by: Vitaly Kuznetsov Message-Id: <20211116163443.88707-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecd2041570d1..5a403d92833f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4179,7 +4179,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; From 4d7804d201f2588469798faaab7b54caeb67410b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 15 Nov 2021 17:34:50 +0100 Subject: [PATCH 271/336] parisc: Include stringify.h to avoid build error in crypto/api.c Include stringify.h to avoid this build error: arch/parisc/include/asm/jump_label.h: error: expected ':' before '__stringify' arch/parisc/include/asm/jump_label.h: error: label 'l_yes' defined but not used [-Werror=unused-label] Signed-off-by: Helge Deller Reported-by: kernel test robot --- arch/parisc/include/asm/jump_label.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h index 7efb1aa2f7f8..af2a598bc0f8 100644 --- a/arch/parisc/include/asm/jump_label.h +++ b/arch/parisc/include/asm/jump_label.h @@ -5,6 +5,7 @@ #ifndef __ASSEMBLY__ #include +#include #include #define JUMP_LABEL_NOP_SIZE 4 From 8f663eb3b7e8c4c88919be8c42768a8100ca6060 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 16 Nov 2021 13:11:26 +0100 Subject: [PATCH 272/336] parisc: Wire up futex_waitv Signed-off-by: Helge Deller --- arch/parisc/kernel/syscalls/syscall.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index bf751e0732b7..358c00000755 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 446 common landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv From 4017b230c960c9d6c6a19e52bf3df01222dc7737 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 16 Nov 2021 13:12:21 +0100 Subject: [PATCH 273/336] parisc: Wrap assembler related defines inside __ASSEMBLY__ Building allmodconfig shows errors in the gpu/drm/msm snapdragon drivers, because a COND() define is used there which conflicts with the COND() for PA-RISC assembly. Although the snapdragon driver isn't relevant for parisc, it is nevertheless compiled when CONFIG_COMPILE_TEST is defined. Move the COND() define and other PA-RISC mnemonics inside the #ifdef __ASSEMBLY__ part to avoid this conflict. 
Signed-off-by: Helge Deller Reported-by: kernel test robot --- arch/parisc/include/asm/assembly.h | 44 ++++++++++++++++-------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h index 7085df079702..39e7985086f9 100644 --- a/arch/parisc/include/asm/assembly.h +++ b/arch/parisc/include/asm/assembly.h @@ -3,38 +3,19 @@ * Copyright (C) 1999 Hewlett-Packard (Frank Rowand) * Copyright (C) 1999 Philipp Rumpf * Copyright (C) 1999 SuSE GmbH + * Copyright (C) 2021 Helge Deller */ #ifndef _PARISC_ASSEMBLY_H #define _PARISC_ASSEMBLY_H -#define CALLEE_FLOAT_FRAME_SIZE 80 - #ifdef CONFIG_64BIT -#define LDREG ldd -#define STREG std -#define LDREGX ldd,s -#define LDREGM ldd,mb -#define STREGM std,ma -#define SHRREG shrd -#define SHLREG shld -#define ANDCM andcm,* -#define COND(x) * ## x #define RP_OFFSET 16 #define FRAME_SIZE 128 #define CALLEE_REG_FRAME_SIZE 144 #define REG_SZ 8 #define ASM_ULONG_INSN .dword #else /* CONFIG_64BIT */ -#define LDREG ldw -#define STREG stw -#define LDREGX ldwx,s -#define LDREGM ldwm -#define STREGM stwm -#define SHRREG shr -#define SHLREG shlw -#define ANDCM andcm -#define COND(x) x #define RP_OFFSET 20 #define FRAME_SIZE 64 #define CALLEE_REG_FRAME_SIZE 128 @@ -45,6 +26,7 @@ /* Frame alignment for 32- and 64-bit */ #define FRAME_ALIGN 64 +#define CALLEE_FLOAT_FRAME_SIZE 80 #define CALLEE_SAVE_FRAME_SIZE (CALLEE_REG_FRAME_SIZE + CALLEE_FLOAT_FRAME_SIZE) #ifdef CONFIG_PA20 @@ -67,6 +49,28 @@ #ifdef __ASSEMBLY__ +#ifdef CONFIG_64BIT +#define LDREG ldd +#define STREG std +#define LDREGX ldd,s +#define LDREGM ldd,mb +#define STREGM std,ma +#define SHRREG shrd +#define SHLREG shld +#define ANDCM andcm,* +#define COND(x) * ## x +#else /* CONFIG_64BIT */ +#define LDREG ldw +#define STREG stw +#define LDREGX ldwx,s +#define LDREGM ldwm +#define STREGM stwm +#define SHRREG shr +#define SHLREG shlw +#define ANDCM andcm +#define COND(x) x +#endif + #ifdef CONFIG_64BIT /* the 64-bit pa gnu assembler unfortunately defaults to .level 1.1 or 2.0 so * work around that for now... */ From 79df39d535c7a3770856fe9f5aba8c0ad1eebdb6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 17 Nov 2021 11:05:07 +0100 Subject: [PATCH 274/336] Revert "parisc: Reduce sigreturn trampoline to 3 instructions" This reverts commit e4f2006f1287e7ea17660490569cff323772dac4. This patch shows problems with signal handling. Revert it for now. Signed-off-by: Helge Deller Cc: # v5.15 --- arch/parisc/include/asm/rt_sigframe.h | 2 +- arch/parisc/kernel/signal.c | 13 +++++++------ arch/parisc/kernel/signal32.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/parisc/include/asm/rt_sigframe.h b/arch/parisc/include/asm/rt_sigframe.h index 4b9e3d707571..2b3010ade00e 100644 --- a/arch/parisc/include/asm/rt_sigframe.h +++ b/arch/parisc/include/asm/rt_sigframe.h @@ -2,7 +2,7 @@ #ifndef _ASM_PARISC_RT_SIGFRAME_H #define _ASM_PARISC_RT_SIGFRAME_H -#define SIGRETURN_TRAMP 3 +#define SIGRETURN_TRAMP 4 #define SIGRESTARTBLOCK_TRAMP 5 #define TRAMP_SIZE (SIGRETURN_TRAMP + SIGRESTARTBLOCK_TRAMP) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index bbfe23c40c01..46b1050640b8 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -288,21 +288,22 @@ setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs, already in userspace. The first words of tramp are used to save the previous sigrestartblock trampoline that might be on the stack. 
We start the sigreturn trampoline at - SIGRESTARTBLOCK_TRAMP. */ + SIGRESTARTBLOCK_TRAMP+X. */ err |= __put_user(in_syscall ? INSN_LDI_R25_1 : INSN_LDI_R25_0, &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]); - err |= __put_user(INSN_BLE_SR2_R0, + err |= __put_user(INSN_LDI_R20, &frame->tramp[SIGRESTARTBLOCK_TRAMP+1]); - err |= __put_user(INSN_LDI_R20, + err |= __put_user(INSN_BLE_SR2_R0, &frame->tramp[SIGRESTARTBLOCK_TRAMP+2]); + err |= __put_user(INSN_NOP, &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]); - start = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+0]; - end = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP+3]; + start = (unsigned long) &frame->tramp[0]; + end = (unsigned long) &frame->tramp[TRAMP_SIZE]; flush_user_dcache_range_asm(start, end); flush_user_icache_range_asm(start, end); /* TRAMP Words 0-4, Length 5 = SIGRESTARTBLOCK_TRAMP - * TRAMP Words 5-7, Length 3 = SIGRETURN_TRAMP + * TRAMP Words 5-9, Length 4 = SIGRETURN_TRAMP * So the SIGRETURN_TRAMP is at the end of SIGRESTARTBLOCK_TRAMP */ rp = (unsigned long) &frame->tramp[SIGRESTARTBLOCK_TRAMP]; diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h index a5bdbb5678b7..f166250f2d06 100644 --- a/arch/parisc/kernel/signal32.h +++ b/arch/parisc/kernel/signal32.h @@ -36,7 +36,7 @@ struct compat_regfile { compat_int_t rf_sar; }; -#define COMPAT_SIGRETURN_TRAMP 3 +#define COMPAT_SIGRETURN_TRAMP 4 #define COMPAT_SIGRESTARTBLOCK_TRAMP 5 #define COMPAT_TRAMP_SIZE (COMPAT_SIGRETURN_TRAMP + \ COMPAT_SIGRESTARTBLOCK_TRAMP) From 9412f5aaa86429e018941994076bd63a0618111c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 17 Nov 2021 15:48:45 +0100 Subject: [PATCH 275/336] parisc: Enable CONFIG_PRINTK_TIME=y in 32bit defconfig Signed-off-by: Helge Deller --- arch/parisc/configs/generic-32bit_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index d6fd8fa7ed8c..53061cb2cf7f 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -231,6 +231,7 @@ CONFIG_CRYPTO_DEFLATE=y CONFIG_CRC_CCITT=m CONFIG_CRC_T10DIF=y CONFIG_FONTS=y +CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y CONFIG_DEBUG_MEMORY_INIT=y From a66998e0fbf213d47d02813b9679426129d0d114 Mon Sep 17 00:00:00 2001 From: Teng Qi Date: Wed, 17 Nov 2021 11:44:53 +0800 Subject: [PATCH 276/336] ethernet: hisilicon: hns: hns_dsaf_misc: fix a possible array overflow in hns_dsaf_ge_srst_by_port() The if statement: if (port >= DSAF_GE_NUM) return; limits the value of port less than DSAF_GE_NUM (i.e., 8). However, if the value of port is 6 or 7, an array overflow could occur: port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off; because the length of dsaf_dev->mac_cb is DSAF_MAX_PORT_NUM (i.e., 6). To fix this possible array overflow, we first check port and if it is greater than or equal to DSAF_MAX_PORT_NUM, the function returns. Reported-by: TOTE Robot Signed-off-by: Teng Qi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c index 23d9cbf262c3..740850b64aff 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c @@ -400,6 +400,10 @@ static void hns_dsaf_ge_srst_by_port(struct dsaf_device *dsaf_dev, u32 port, return; if (!HNS_DSAF_IS_DEBUG(dsaf_dev)) { + /* DSAF_MAX_PORT_NUM is 6, but DSAF_GE_NUM is 8. We need check to prevent array overflow */ + if (port >= DSAF_MAX_PORT_NUM) + return; reg_val_1 = 0x1 << port; port_rst_off = dsaf_dev->mac_cb[port]->port_rst_off; /* there is difference between V1 and V2 in register.*/ From f915b75bffb7257bd8d26376b8e1cc67771927f8 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 17 Nov 2021 15:56:52 +0800 Subject: [PATCH 277/336] page_pool: Revert "page_pool: disable dma mapping support..." This reverts commit d00e60ee54b12de945b8493cf18c1ada9e422514. As reported by Guillaume in [1]: Enabling LPAE always enables CONFIG_ARCH_DMA_ADDR_T_64BIT in 32-bit systems, which breaks the bootup process when an ethernet driver is using page pool with the PP_FLAG_DMA_MAP flag. We were hoping there were no active consumers on such systems when we removed the dma mapping support, but LPAE seems to be a common feature on 32-bit systems, so revert it. 1. https://www.spinics.net/lists/netdev/msg779890.html Fixes: d00e60ee54b1 ("page_pool: disable dma mapping support for 32-bit arch with 64-bit DMA") Signed-off-by: Yunsheng Lin Reported-by: "kernelci.org bot" Tested-by: "kernelci.org bot" Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/mm_types.h | 13 ++++++++++++- include/net/page_pool.h | 12 +++++++++++- net/core/page_pool.c | 10 ++++------ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb8c6f5f19bc..c3a6e6209600 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -105,7 +105,18 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + union { + /** + * dma_addr_upper: might require a 64-bit + * value on 32-bit architectures. + */ + unsigned long dma_addr_upper; + /** + * For frag page support, not supported in + * 32-bit architectures with 64-bit DMA.
+ */ + atomic_long_t pp_frag_count; + }; }; struct { /* slab, slob and slub */ union { diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 3855f069627f..a4082406a003 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -216,14 +216,24 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ + (sizeof(dma_addr_t) > sizeof(unsigned long)) + static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { - return page->dma_addr; + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16; + + return ret; } static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { page->dma_addr = addr; + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT) + page->dma_addr_upper = upper_32_bits(addr); } static inline void page_pool_set_frag_count(struct page *page, long nr) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b60e4301a44..1a6978427d6c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -49,12 +49,6 @@ static int page_pool_init(struct page_pool *pool, * which is the XDP_TX use-case. */ if (pool->p.flags & PP_FLAG_DMA_MAP) { - /* DMA-mapping is not supported on 32-bit systems with - * 64-bit DMA mapping. - */ - if (sizeof(dma_addr_t) > sizeof(unsigned long)) - return -EOPNOTSUPP; - if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; @@ -75,6 +69,10 @@ static int page_pool_init(struct page_pool *pool, */ } + if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT && + pool->p.flags & PP_FLAG_PAGE_FRAG) + return -EINVAL; + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) return -ENOMEM; From fec1faf221f61118aa52f44c65a13c3e173a64c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Nov 2021 16:49:09 +0200 Subject: [PATCH 278/336] devlink: Don't throw an error if flash notification sent before devlink visible The mlxsw driver calls various devlink flash routines even before users can get any access to the devlink instance itself. For example, mlxsw_core_fw_rev_validate() is one such function. __mlxsw_core_bus_device_register -> mlxsw_core_fw_rev_validate -> mlxsw_core_fw_flash -> mlxfw_firmware_flash -> mlxfw_status_notify -> devlink_flash_update_status_notify -> __devlink_flash_update_notify -> WARN_ON(...) This causes the WARN_ON to trigger a warning about devlink not being registered. Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Danielle Ratson Tested-by: Danielle Ratson Signed-off-by: Leon Romanovsky Acked-by: Jakub Kicinski Signed-off-by: David S.
Miller --- net/core/devlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 5ba4f9434acd..5ad72dbfcd07 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4229,7 +4229,9 @@ static void __devlink_flash_update_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && cmd != DEVLINK_CMD_FLASH_UPDATE_END && cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); - WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) From 5d2ca2e12dfb2aff3388ca57b06f570fa6206ced Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Wed, 17 Nov 2021 12:59:52 -0800 Subject: [PATCH 279/336] e100: fix device suspend/resume As reported in [1], e100 was no longer working for suspend/resume cycles. The previous commit mentioned in the fixes appears to have broken things and this attempts to practice best known methods for device power management and keep wake-up working while allowing suspend/resume to work. To do this, I reorder a little bit of code and fix the resume path to make sure the device is enabled. [1] https://bugzilla.kernel.org/show_bug.cgi?id=214933 Fixes: 69a74aef8a18 ("e100: use generic power management") Cc: Vaibhav Gupta Reported-by: Alexey Kuznetsov Signed-off-by: Jesse Brandeburg Tested-by: Alexey Kuznetsov Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e100.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/e100.c b/drivers/net/ethernet/intel/e100.c index 5039a2536951..0bf3d47bb90d 100644 --- a/drivers/net/ethernet/intel/e100.c +++ b/drivers/net/ethernet/intel/e100.c @@ -3003,9 +3003,10 @@ static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake) struct net_device *netdev = pci_get_drvdata(pdev); struct nic *nic = netdev_priv(netdev); + netif_device_detach(netdev); + if (netif_running(netdev)) e100_down(nic); - netif_device_detach(netdev); if ((nic->flags & wol_magic) | e100_asf(nic)) { /* enable reverse auto-negotiation */ @@ -3022,7 +3023,7 @@ static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake) *enable_wake = false; } - pci_clear_master(pdev); + pci_disable_device(pdev); } static int __e100_power_off(struct pci_dev *pdev, bool wake) @@ -3042,8 +3043,6 @@ static int __maybe_unused e100_suspend(struct device *dev_d) __e100_shutdown(to_pci_dev(dev_d), &wake); - device_wakeup_disable(dev_d); - return 0; } @@ -3051,6 +3050,14 @@ static int __maybe_unused e100_resume(struct device *dev_d) { struct net_device *netdev = dev_get_drvdata(dev_d); struct nic *nic = netdev_priv(netdev); + int err; + + err = pci_enable_device(to_pci_dev(dev_d)); + if (err) { + netdev_err(netdev, "Resume cannot enable PCI device, aborting\n"); + return err; + } + pci_set_master(to_pci_dev(dev_d)); /* disable reverse auto-negotiation */ if (nic->phy == phy_82552_v) { @@ -3062,10 +3069,11 @@ static int __maybe_unused e100_resume(struct device *dev_d) smartspeed & ~(E100_82552_REV_ANEG)); } - netif_device_attach(netdev); if (netif_running(netdev)) e100_up(nic); + netif_device_attach(netdev); + return 0; } From 5f9c55c8066bcd93ac25234a02585701fe2e31df Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Wed, 17 Nov 2021 20:06:48 +0100 Subject: [PATCH 280/336] ipv6: check return value of ipv6_skip_exthdr The offset value is used in pointer math on skb->data. 
Since ipv6_skip_exthdr may return -1 the pointer to uh and th may not point to the actual udp and tcp headers and potentially overwrite other stuff. This is why I think this should be checked. EDIT: added {}'s, thanks Kees Signed-off-by: Jordy Zomer Signed-off-by: David S. Miller --- net/ipv6/esp6.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ed2f061b8768..f0bac6f7ab6b 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -808,6 +808,12 @@ int esp6_input_done2(struct sk_buff *skb, int err) struct tcphdr *th; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) { + err = -EINVAL; + goto out; + } + uh = (void *)(skb->data + offset); th = (void *)(skb->data + offset); hdr_len += offset; From 61217be886b5f7402843677e4be7e7e83de9cb41 Mon Sep 17 00:00:00 2001 From: zhangyue Date: Thu, 18 Nov 2021 13:46:32 +0800 Subject: [PATCH 281/336] net: tulip: de4x5: fix the problem that the array 'lp->phy[8]' may be out of bound In line 5001, if all id in the array 'lp->phy[8]' is not 0, when the 'for' end, the 'k' is 8. At this time, the array 'lp->phy[8]' may be out of bound. Signed-off-by: zhangyue Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/de4x5.c | 30 +++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 13121c4dcfe6..bb334042f8e1 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -5000,19 +5000,23 @@ mii_get_phy(struct net_device *dev) } if ((j == limit) && (i < DE4X5_MAX_MII)) { for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); - lp->phy[k].addr = i; - lp->phy[k].id = id; - lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ - lp->phy[k].spd.mask = GENERIC_MASK; /* 100Mb/s technologies */ - lp->phy[k].spd.value = GENERIC_VALUE; /* TX & T4, H/F Duplex */ - lp->mii_cnt++; - lp->active++; - printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name); - j = de4x5_debug; - de4x5_debug |= DEBUG_MII; - de4x5_dbg_mii(dev, k); - de4x5_debug = j; - printk("\n"); + if (k < DE4X5_MAX_PHY) { + lp->phy[k].addr = i; + lp->phy[k].id = id; + lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ + lp->phy[k].spd.mask = GENERIC_MASK; /* 100Mb/s technologies */ + lp->phy[k].spd.value = GENERIC_VALUE; /* TX & T4, H/F Duplex */ + lp->mii_cnt++; + lp->active++; + printk("%s: Using generic MII device control. If the board doesn't operate,\nplease mail the following dump to the author:\n", dev->name); + j = de4x5_debug; + de4x5_debug |= DEBUG_MII; + de4x5_dbg_mii(dev, k); + de4x5_debug = j; + printk("\n"); + } else { + goto purgatory; + } } } purgatory: From 0fa68da72c3be09e06dd833258ee89c33374195f Mon Sep 17 00:00:00 2001 From: Teng Qi Date: Thu, 18 Nov 2021 15:01:18 +0800 Subject: [PATCH 282/336] net: ethernet: dec: tulip: de4x5: fix possible array overflows in type3_infoblock() The definition of macro MOTO_SROM_BUG is: #define MOTO_SROM_BUG (lp->active == 8 && (get_unaligned_le32( dev->dev_addr) & 0x00ffffff) == 0x3e0008) and the if statement if (MOTO_SROM_BUG) lp->active = 0; using this macro indicates lp->active could be 8. If lp->active is 8 and the second comparison of this macro is false. lp->active will remain 8 in: lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].rst = (*p ? 
p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].mc = get_unaligned_le16(p); p += 2; lp->phy[lp->active].ana = get_unaligned_le16(p); p += 2; lp->phy[lp->active].fdx = get_unaligned_le16(p); p += 2; lp->phy[lp->active].ttm = get_unaligned_le16(p); p += 2; lp->phy[lp->active].mci = *p; However, the length of array lp->phy is 8, so array overflows can occur. To fix these possible array overflows, we first check lp->active and then return -EINVAL if it is greater or equal to ARRAY_SIZE(lp->phy) (i.e. 8). Reported-by: TOTE Robot Signed-off-by: Teng Qi Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/dec/tulip/de4x5.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index bb334042f8e1..71730ef4cd57 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -4709,6 +4709,10 @@ type3_infoblock(struct net_device *dev, u_char count, u_char *p) lp->ibn = 3; lp->active = *p++; if (MOTO_SROM_BUG) lp->active = 0; + /* if (MOTO_SROM_BUG) statement indicates lp->active could + * be 8 (i.e. the size of array lp->phy) */ + if (WARN_ON(lp->active >= ARRAY_SIZE(lp->phy))) + return -EINVAL; lp->phy[lp->active].gep = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].rst = (*p ? p : NULL); p += (2 * (*p) + 1); lp->phy[lp->active].mc = get_unaligned_le16(p); p += 2; From c7521d3aa2fa7fc785682758c99b5bcae503f6be Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Nov 2021 14:22:11 +0300 Subject: [PATCH 283/336] ptp: ocp: Fix a couple NULL vs IS_ERR() checks The ptp_ocp_get_mem() function does not return NULL, it returns error pointers. Fixes: 773bda964921 ("ptp: ocp: Expose various resources on the timecard.") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/ptp/ptp_ocp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 34f943c8c9fd..0f1b5a7d2a89 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -1304,10 +1304,11 @@ ptp_ocp_register_ext(struct ptp_ocp *bp, struct ocp_resource *r) if (!ext) return -ENOMEM; - err = -EINVAL; ext->mem = ptp_ocp_get_mem(bp, r); - if (!ext->mem) + if (IS_ERR(ext->mem)) { + err = PTR_ERR(ext->mem); goto out; + } ext->bp = bp; ext->info = r->extra; @@ -1371,8 +1372,8 @@ ptp_ocp_register_mem(struct ptp_ocp *bp, struct ocp_resource *r) void __iomem *mem; mem = ptp_ocp_get_mem(bp, r); - if (!mem) - return -EINVAL; + if (IS_ERR(mem)) + return PTR_ERR(mem); bp_assign_entry(bp, r, mem); From b075c1d81e7d0e96758877ff9ded30ab87df2b77 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 1 Jul 2021 13:39:15 -0300 Subject: [PATCH 284/336] tools headers cpufeatures: Sync with the kernel sources To pick the changes from: eec2113eabd92b7b ("x86/fpu/amx: Define AMX state components and have it used for boot-time checks") This only causes these perf files to be rebuilt: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Borislav Petkov Cc: Chang S. 
Bae Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index d0ce5cfd3ac1..d5b5f2ab87a0 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -277,6 +277,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -298,6 +299,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ From 346e91998cba46b64e8ef5f89813f8918c2731e6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 9 May 2021 09:39:02 -0300 Subject: [PATCH 285/336] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: b56639318bb2be66 ("KVM: SEV: Add support for SEV intra host migration") e615e355894e6197 ("KVM: x86: On emulation failure, convey the exit reason, etc. to userspace") a9d496d8e08ca1eb ("KVM: x86: Clarify the kvm_run.emulation_failure structure layout") c68dc1b577eabd56 ("KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK") dea8ee31a0392775 ("RISC-V: KVM: Add SBI v0.1 support") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This is also by now used by tools/testing/selftests/kvm/, a simple test build succeeded. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Anup Patel Cc: Atish Patra Cc: David Edmondson Cc: Oliver Upton Cc: Paolo Bonzini Cc: Peter Gonda Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index a067410ebea5..1daa45268de2 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -269,6 +269,7 @@ struct kvm_xen_exit { #define KVM_EXIT_AP_RESET_HOLD 32 #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 +#define KVM_EXIT_RISCV_SBI 35 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -397,13 +398,23 @@ struct kvm_run { * "ndata" is correct, that new fields are enumerated in "flags", * and that each flag enumerates fields that are 64-bit aligned * and sized (so that ndata+internal.data[] is valid/accurate). + * + * Space beyond the defined fields may be used to store arbitrary + * debug information relating to the emulation failure. It is + * accounted for in "ndata" but the format is unspecified and is + * not represented in "flags". Any such information is *not* ABI! 
*/ struct { __u32 suberror; __u32 ndata; __u64 flags; - __u8 insn_size; - __u8 insn_bytes[15]; + union { + struct { + __u8 insn_size; + __u8 insn_bytes[15]; + }; + }; + /* Arbitrary debug data may follow. */ } emulation_failure; /* KVM_EXIT_OSI */ struct { @@ -469,6 +480,13 @@ struct kvm_run { } msr; /* KVM_EXIT_XEN */ struct kvm_xen_exit xen; + /* KVM_EXIT_RISCV_SBI */ + struct { + unsigned long extension_id; + unsigned long function_id; + unsigned long args[6]; + unsigned long ret[2]; + } riscv_sbi; /* Fix the size of the union. */ char padding[256]; }; @@ -1112,6 +1130,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_BINARY_STATS_FD 203 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 +#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #ifdef KVM_CAP_IRQ_ROUTING @@ -1223,11 +1242,16 @@ struct kvm_irqfd { /* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */ #define KVM_CLOCK_TSC_STABLE 2 +#define KVM_CLOCK_REALTIME (1 << 2) +#define KVM_CLOCK_HOST_TSC (1 << 3) struct kvm_clock_data { __u64 clock; __u32 flags; - __u32 pad[9]; + __u32 pad0; + __u64 realtime; + __u64 host_tsc; + __u32 pad[4]; }; /* For KVM_CAP_SW_TLB */ From ccb05590c4325ce50b1c0deedd54d5f24f4f1652 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Nov 2021 13:02:52 -0300 Subject: [PATCH 286/336] perf tests wp: Remove unused functions on s390 Fixing these build problems: tests/wp.c:24:12: error: 'wp_read' defined but not used [-Werror=unused-function] static int wp_read(int fd, long long *count, int size) ^ tests/wp.c:35:13: error: 'get__perf_event_attr' defined but not used [-Werror=unused-function] static void get__perf_event_attr(struct perf_event_attr *attr, int wp_type, ^ CC /tmp/build/perf/util/print_binary.o Fixes: e47c6ecaae1df54a ("perf test: Convert watch point tests to test cases.") Cc: Alexander Shishkin Cc: Brendan Higgins Cc: Daniel Latypov Cc: David Gow Cc: Ian Rogers Cc: Ingo Molnar Cc: Jin Yao Cc: Jiri Olsa Cc: John Garry Cc: Mark Rutland Cc: Namhyung Kim Cc: Paul Clarke Cc: Peter Zijlstra Cc: Sohaib Mohamed Cc: Stephane Eranian Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/wp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c index 820d942b30c3..9d4c45184e71 100644 --- a/tools/perf/tests/wp.c +++ b/tools/perf/tests/wp.c @@ -21,6 +21,7 @@ do { \ volatile u64 data1; volatile u8 data2[3]; +#ifndef __s390x__ static int wp_read(int fd, long long *count, int size) { int ret = read(fd, count, size); @@ -48,7 +49,6 @@ static void get__perf_event_attr(struct perf_event_attr *attr, int wp_type, attr->exclude_hv = 1; } -#ifndef __s390x__ static int __event(int wp_type, void *wp_addr, unsigned long wp_len) { int fd; From 70f9c9b2df1dd12cf40862b2b31c7bf89e311066 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Nov 2021 14:44:17 -0300 Subject: [PATCH 287/336] perf tools: Set COMPAT_NEED_REALLOCARRAY for CONFIG_AUXTRACE=1 As it is being used in tools/perf/arch/arm64/util/arm-spe.c and the COMPAT_NEED_REALLOCARRAY was only being set when CORESIGHT=1 is set. 
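For context, COMPAT_NEED_REALLOCARRAY tells the tools build to supply its own reallocarray() where the libc lacks one; a minimal sketch of that kind of fallback (illustrative only, not copied from the tools headers):

        #include <errno.h>
        #include <stdlib.h>

        #ifdef COMPAT_NEED_REALLOCARRAY
        static inline void *reallocarray(void *ptr, size_t nmemb, size_t size)
        {
                /* refuse nmemb * size if it would overflow size_t */
                if (size && nmemb > (size_t)-1 / size) {
                        errno = ENOMEM;
                        return NULL;
                }
                return realloc(ptr, nmemb * size);
        }
        #endif

Without the define also being set for CONFIG_AUXTRACE=1, the reallocarray() call in arm-spe.c only builds on systems whose libc already provides the function.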
Fixes: 56c31cdff7c2a640 ("perf arm-spe: Implement find_snapshot callback") Cc: Alexander Shishkin Cc: German Gomez Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Will Deacon Link: https://lore.kernel.org/all/YZT63mIc7iY01er3@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 07e65a061fd3..afd144725a0b 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -1010,6 +1010,9 @@ ifndef NO_AUXTRACE ifndef NO_AUXTRACE $(call detected,CONFIG_AUXTRACE) CFLAGS += -DHAVE_AUXTRACE_SUPPORT + ifeq ($(feature-reallocarray), 0) + CFLAGS += -DCOMPAT_NEED_REALLOCARRAY + endif endif endif From 784e8adda4cdb3e2510742023729851b6c08803c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Nov 2021 15:56:15 -0700 Subject: [PATCH 288/336] perf sort: Fix the 'weight' sort key behavior Currently, the 'weight' field in the perf sample has latency information for some instructions like in memory accesses. And perf tool has 'weight' and 'local_weight' sort keys to display the info. But it's somewhat confusing what it shows exactly. In my understanding, 'local_weight' shows a weight in a single sample, and (global) 'weight' shows a sum of the weights in the hist_entry. For example: $ perf mem record -t load dd if=/dev/zero of=/dev/null bs=4k count=1M $ perf report --stdio -n -s +local_weight ... # # Overhead Samples Command Shared Object Symbol Local Weight # ........ ....... ....... ................ ......................... ............ # 21.23% 313 dd [kernel.vmlinux] [k] lockref_get_not_zero 32 12.43% 183 dd [kernel.vmlinux] [k] lockref_get_not_zero 35 11.97% 159 dd [kernel.vmlinux] [k] lockref_get_not_zero 36 10.40% 141 dd [kernel.vmlinux] [k] lockref_put_return 32 7.63% 113 dd [kernel.vmlinux] [k] lockref_get_not_zero 33 6.37% 92 dd [kernel.vmlinux] [k] lockref_get_not_zero 34 6.15% 90 dd [kernel.vmlinux] [k] lockref_put_return 33 ... So let's look at the 'lockref_get_not_zero' symbols. The top entry shows that 313 samples were captured with 'local_weight' 32, so the total weight should be 313 x 32 = 10016. But it's not the case: $ perf report --stdio -n -s +local_weight,weight -S lockref_get_not_zero ... # # Overhead Samples Command Shared Object Local Weight Weight # ........ ....... ....... ................ ............ ...... # 1.36% 4 dd [kernel.vmlinux] 36 144 0.47% 4 dd [kernel.vmlinux] 37 148 0.42% 4 dd [kernel.vmlinux] 32 128 0.40% 4 dd [kernel.vmlinux] 34 136 0.35% 4 dd [kernel.vmlinux] 36 144 0.34% 4 dd [kernel.vmlinux] 35 140 0.30% 4 dd [kernel.vmlinux] 36 144 0.30% 4 dd [kernel.vmlinux] 34 136 0.30% 4 dd [kernel.vmlinux] 32 128 0.30% 4 dd [kernel.vmlinux] 32 128 ... With the 'weight' sort key, it's divided to 4 samples even with the same info ('comm', 'dso', 'sym' and 'local_weight'). I don't think this is what we want. I found this because of the way it aggregates the 'weight' value. Since it's not a period, we should not add them in the he->stat. Otherwise, two 32 'weight' entries will create a 64 'weight' entry. After that, new 32 'weight' samples don't have a matching entry so it'd create a new entry and make it a 64 'weight' entry again and again. Later, they will be merged into 128 'weight' entries during the hists__collapse_resort() with 4 samples, multiple times like above. Let's keep the weight and display it differently. 
For 'local_weight', it can show the weight as is, and for (global) 'weight' it can display the number multiplied by the number of samples. With this change, I can see the expected numbers. $ perf report --stdio -n -s +local_weight,weight -S lockref_get_not_zero ... # # Overhead Samples Command Shared Object Local Weight Weight # ........ ....... ....... ................ ............ ..... # 21.23% 313 dd [kernel.vmlinux] 32 10016 12.43% 183 dd [kernel.vmlinux] 35 6405 11.97% 159 dd [kernel.vmlinux] 36 5724 7.63% 113 dd [kernel.vmlinux] 33 3729 6.37% 92 dd [kernel.vmlinux] 34 3128 4.17% 59 dd [kernel.vmlinux] 37 2183 0.08% 1 dd [kernel.vmlinux] 269 269 0.08% 1 dd [kernel.vmlinux] 38 38 Reviewed-by: Athira Jajeev Signed-off-by: Namhyung Kim Tested-by: Athira Jajeev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20211105225617.151364-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 14 +++++--------- tools/perf/util/sort.c | 24 +++++++----------------- tools/perf/util/sort.h | 2 +- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 65fe65ba03c2..4e9bd7b589b1 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -290,11 +290,9 @@ static long hist_time(unsigned long htime) } static void he_stat__add_period(struct he_stat *he_stat, u64 period, - u64 weight, u64 ins_lat, u64 p_stage_cyc) + u64 ins_lat, u64 p_stage_cyc) { - he_stat->period += period; - he_stat->weight += weight; he_stat->nr_events += 1; he_stat->ins_lat += ins_lat; he_stat->p_stage_cyc += p_stage_cyc; @@ -308,9 +306,8 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_guest_sys += src->period_guest_sys; dest->period_guest_us += src->period_guest_us; dest->nr_events += src->nr_events; - dest->weight += src->weight; dest->ins_lat += src->ins_lat; - dest->p_stage_cyc += src->p_stage_cyc; + dest->p_stage_cyc += src->p_stage_cyc; } static void he_stat__decay(struct he_stat *he_stat) @@ -598,7 +595,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, struct hist_entry *he; int64_t cmp; u64 period = entry->stat.period; - u64 weight = entry->stat.weight; u64 ins_lat = entry->stat.ins_lat; u64 p_stage_cyc = entry->stat.p_stage_cyc; bool leftmost = true; @@ -619,11 +615,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period, weight, ins_lat, p_stage_cyc); + he_stat__add_period(&he->stat, period, ins_lat, p_stage_cyc); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) - he_stat__add_period(he->stat_acc, period, weight, ins_lat, p_stage_cyc); + he_stat__add_period(he->stat_acc, period, ins_lat, p_stage_cyc); /* * This mem info was allocated from sample__resolve_mem @@ -733,7 +729,6 @@ __hists__add_entry(struct hists *hists, .stat = { .nr_events = 1, .period = sample->period, - .weight = sample->weight, .ins_lat = sample->ins_lat, .p_stage_cyc = sample->p_stage_cyc, }, @@ -748,6 +743,7 @@ __hists__add_entry(struct hists *hists, .raw_size = sample->raw_size, .ops = ops, .time = hist_time(sample->time), + .weight = sample->weight, }, *he = hists__findnew_entry(hists, &entry, al, sample_self); if (!hists->has_callchains && he && he->callchain_size != 0) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 568a88c001c6..903f34fff27e 
100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1325,45 +1325,35 @@ struct sort_entry sort_mispredict = { .se_width_idx = HISTC_MISPREDICT, }; -static u64 he_weight(struct hist_entry *he) -{ - return he->stat.nr_events ? he->stat.weight / he->stat.nr_events : 0; -} - static int64_t -sort__local_weight_cmp(struct hist_entry *left, struct hist_entry *right) +sort__weight_cmp(struct hist_entry *left, struct hist_entry *right) { - return he_weight(left) - he_weight(right); + return left->weight - right->weight; } static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*llu", width, he_weight(he)); + return repsep_snprintf(bf, size, "%-*llu", width, he->weight); } struct sort_entry sort_local_weight = { .se_header = "Local Weight", - .se_cmp = sort__local_weight_cmp, + .se_cmp = sort__weight_cmp, .se_snprintf = hist_entry__local_weight_snprintf, .se_width_idx = HISTC_LOCAL_WEIGHT, }; -static int64_t -sort__global_weight_cmp(struct hist_entry *left, struct hist_entry *right) -{ - return left->stat.weight - right->stat.weight; -} - static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*llu", width, he->stat.weight); + return repsep_snprintf(bf, size, "%-*llu", width, + he->weight * he->stat.nr_events); } struct sort_entry sort_global_weight = { .se_header = "Weight", - .se_cmp = sort__global_weight_cmp, + .se_cmp = sort__weight_cmp, .se_snprintf = hist_entry__global_weight_snprintf, .se_width_idx = HISTC_GLOBAL_WEIGHT, }; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index b67c469aba79..e18b79916f63 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -49,7 +49,6 @@ struct he_stat { u64 period_us; u64 period_guest_sys; u64 period_guest_us; - u64 weight; u64 ins_lat; u64 p_stage_cyc; u32 nr_events; @@ -109,6 +108,7 @@ struct hist_entry { s32 socket; s32 cpu; u64 code_page_size; + u64 weight; u8 cpumode; u8 depth; From 4d03c75363eeca861c843319a0e6f4426234ed6c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Nov 2021 15:56:16 -0700 Subject: [PATCH 289/336] perf sort: Fix the 'ins_lat' sort key behavior Handle 'ins_lat' (for instruction latency) and 'local_ins_lat' sort keys with the same rationale as for the 'weight' and 'local_weight', see the previous fix in this series for a full explanation. But I couldn't test it actually, so only build tested. 
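In short, the same arithmetic as in the 'weight' fix applies here: the per-sample value now lives in struct hist_entry, and the global column is computed at display time as

        he->ins_lat * he->stat.nr_events

so an entry aggregating, say, 313 samples with a per-sample latency of 32 is shown as 313 * 32 = 10016 instead of being split across several partially merged entries.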
Reviewed-by: Athira Jajeev Signed-off-by: Namhyung Kim Tested-by: Athira Jajeev Cc: Andi Kleen Cc: Athira Jajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20211105225617.151364-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 11 ++++------- tools/perf/util/sort.c | 24 +++++++----------------- tools/perf/util/sort.h | 2 +- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 4e9bd7b589b1..54fe97dd191c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -290,11 +290,10 @@ static long hist_time(unsigned long htime) } static void he_stat__add_period(struct he_stat *he_stat, u64 period, - u64 ins_lat, u64 p_stage_cyc) + u64 p_stage_cyc) { he_stat->period += period; he_stat->nr_events += 1; - he_stat->ins_lat += ins_lat; he_stat->p_stage_cyc += p_stage_cyc; } @@ -306,7 +305,6 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_guest_sys += src->period_guest_sys; dest->period_guest_us += src->period_guest_us; dest->nr_events += src->nr_events; - dest->ins_lat += src->ins_lat; dest->p_stage_cyc += src->p_stage_cyc; } @@ -595,7 +593,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, struct hist_entry *he; int64_t cmp; u64 period = entry->stat.period; - u64 ins_lat = entry->stat.ins_lat; u64 p_stage_cyc = entry->stat.p_stage_cyc; bool leftmost = true; @@ -615,11 +612,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period, ins_lat, p_stage_cyc); + he_stat__add_period(&he->stat, period, p_stage_cyc); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) - he_stat__add_period(he->stat_acc, period, ins_lat, p_stage_cyc); + he_stat__add_period(he->stat_acc, period, p_stage_cyc); /* * This mem info was allocated from sample__resolve_mem @@ -729,7 +726,6 @@ __hists__add_entry(struct hists *hists, .stat = { .nr_events = 1, .period = sample->period, - .ins_lat = sample->ins_lat, .p_stage_cyc = sample->p_stage_cyc, }, .parent = sym_parent, @@ -744,6 +740,7 @@ __hists__add_entry(struct hists *hists, .ops = ops, .time = hist_time(sample->time), .weight = sample->weight, + .ins_lat = sample->ins_lat, }, *he = hists__findnew_entry(hists, &entry, al, sample_self); if (!hists->has_callchains && he && he->callchain_size != 0) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 903f34fff27e..adc0584695d6 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1358,45 +1358,35 @@ struct sort_entry sort_global_weight = { .se_width_idx = HISTC_GLOBAL_WEIGHT, }; -static u64 he_ins_lat(struct hist_entry *he) -{ - return he->stat.nr_events ? 
he->stat.ins_lat / he->stat.nr_events : 0; } static int64_t -sort__local_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right) +sort__ins_lat_cmp(struct hist_entry *left, struct hist_entry *right) { - return he_ins_lat(left) - he_ins_lat(right); + return left->ins_lat - right->ins_lat; } static int hist_entry__local_ins_lat_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*u", width, he_ins_lat(he)); + return repsep_snprintf(bf, size, "%-*u", width, he->ins_lat); } struct sort_entry sort_local_ins_lat = { .se_header = "Local INSTR Latency", - .se_cmp = sort__local_ins_lat_cmp, + .se_cmp = sort__ins_lat_cmp, .se_snprintf = hist_entry__local_ins_lat_snprintf, .se_width_idx = HISTC_LOCAL_INS_LAT, }; -static int64_t -sort__global_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right) -{ - return left->stat.ins_lat - right->stat.ins_lat; -} - static int hist_entry__global_ins_lat_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*u", width, he->stat.ins_lat); + return repsep_snprintf(bf, size, "%-*u", width, + he->ins_lat * he->stat.nr_events); } struct sort_entry sort_global_ins_lat = { .se_header = "INSTR Latency", - .se_cmp = sort__global_ins_lat_cmp, + .se_cmp = sort__ins_lat_cmp, .se_snprintf = hist_entry__global_ins_lat_snprintf, .se_width_idx = HISTC_GLOBAL_INS_LAT, }; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index e18b79916f63..22ae7c6ae398 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -49,7 +49,6 @@ struct he_stat { u64 period_us; u64 period_guest_sys; u64 period_guest_us; - u64 ins_lat; u64 p_stage_cyc; u32 nr_events; }; @@ -109,6 +108,7 @@ struct hist_entry { s32 cpu; u64 code_page_size; u64 weight; + u64 ins_lat; u8 cpumode; u8 depth; From db4b284029099224f387d75198e5995df1cb8aef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 5 Nov 2021 15:56:17 -0700 Subject: [PATCH 290/336] perf sort: Fix the 'p_stage_cyc' sort key behavior Handle 'p_stage_cyc' (for pipeline stage cycles) sort key with the same rationale as for the 'weight' and 'local_weight', see the fix in this series for a full explanation. Not sure it also needs the local and global variants. But I couldn't actually test it because I don't have the machine.
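Seen together with the previous two patches, the net structural change is small: the three values stop being accumulated in the per-entry statistics and are carried as per-sample fields of the hist entry itself. A condensed sketch of the resulting layout (field lists trimmed, types spelled out; see the sort.h hunks for the real structs):

        typedef unsigned long long u64; /* stand-ins for the kernel typedefs */
        typedef unsigned int u32;

        struct he_stat {                /* accumulated per entry: periods and event count */
                u64 period;
                u32 nr_events;
                /* other period_* fields omitted; weight, ins_lat and
                   p_stage_cyc no longer live here */
        };

        struct hist_entry {             /* one entry per sort-key combination */
                struct he_stat stat;
                u64 weight;             /* kept per sample, also used as sort keys */
                u64 ins_lat;
                u64 p_stage_cyc;
                /* remaining fields omitted */
        };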
Reviewed-by: Athira Jajeev Signed-off-by: Namhyung Kim Tested-by: Athira Jajeev Cc: Andi Kleen Cc: Athira Jajeev Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20211105225617.151364-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 12 ++++-------- tools/perf/util/sort.c | 4 ++-- tools/perf/util/sort.h | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 54fe97dd191c..b776465e04ef 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -289,12 +289,10 @@ static long hist_time(unsigned long htime) return htime; } -static void he_stat__add_period(struct he_stat *he_stat, u64 period, - u64 p_stage_cyc) +static void he_stat__add_period(struct he_stat *he_stat, u64 period) { he_stat->period += period; he_stat->nr_events += 1; - he_stat->p_stage_cyc += p_stage_cyc; } static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) @@ -305,7 +303,6 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_guest_sys += src->period_guest_sys; dest->period_guest_us += src->period_guest_us; dest->nr_events += src->nr_events; - dest->p_stage_cyc += src->p_stage_cyc; } static void he_stat__decay(struct he_stat *he_stat) @@ -593,7 +590,6 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, struct hist_entry *he; int64_t cmp; u64 period = entry->stat.period; - u64 p_stage_cyc = entry->stat.p_stage_cyc; bool leftmost = true; p = &hists->entries_in->rb_root.rb_node; @@ -612,11 +608,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period, p_stage_cyc); + he_stat__add_period(&he->stat, period); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) - he_stat__add_period(he->stat_acc, period, p_stage_cyc); + he_stat__add_period(he->stat_acc, period); /* * This mem info was allocated from sample__resolve_mem @@ -726,7 +722,6 @@ __hists__add_entry(struct hists *hists, .stat = { .nr_events = 1, .period = sample->period, - .p_stage_cyc = sample->p_stage_cyc, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, @@ -741,6 +736,7 @@ __hists__add_entry(struct hists *hists, .time = hist_time(sample->time), .weight = sample->weight, .ins_lat = sample->ins_lat, + .p_stage_cyc = sample->p_stage_cyc, }, *he = hists__findnew_entry(hists, &entry, al, sample_self); if (!hists->has_callchains && he && he->callchain_size != 0) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index adc0584695d6..a111065b484e 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1394,13 +1394,13 @@ struct sort_entry sort_global_ins_lat = { static int64_t sort__global_p_stage_cyc_cmp(struct hist_entry *left, struct hist_entry *right) { - return left->stat.p_stage_cyc - right->stat.p_stage_cyc; + return left->p_stage_cyc - right->p_stage_cyc; } static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%-*u", width, he->stat.p_stage_cyc); + return repsep_snprintf(bf, size, "%-*u", width, he->p_stage_cyc); } struct sort_entry sort_p_stage_cyc = { diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 22ae7c6ae398..7b7145501933 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -49,7 +49,6 @@ struct 
he_stat { u64 period_us; u64 period_guest_sys; u64 period_guest_us; - u64 p_stage_cyc; u32 nr_events; }; @@ -109,6 +108,7 @@ struct hist_entry { u64 code_page_size; u64 weight; u64 ins_lat; + u64 p_stage_cyc; u8 cpumode; u8 depth; From 162b944598344fd72800d453885979f06ca263f3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 10 Sep 2021 11:46:54 -0300 Subject: [PATCH 291/336] tools headers UAPI: Sync x86's asm/kvm.h with the kernel sources To pick the changes in: 828ca89628bfcb1b ("KVM: x86: Expose TSC offset controls to userspace") That just rebuilds kvm-stat.c on x86, no change in functionality. This silences these perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/kvm.h' differs from latest version at 'arch/x86/include/uapi/asm/kvm.h' diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Oliver Upton Cc: Paolo Bonzini Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 2ef1f6513c68..5a776a08f78c 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -504,4 +504,8 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ +#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ +#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + #endif /* _ASM_X86_KVM_H */ From cb5a63feae2d963cac7b687e6598d620bed13507 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 17 Nov 2021 09:26:09 +0100 Subject: [PATCH 292/336] perf test sample-parsing: Fix branch_stack entry endianness check Commit 10269a2ca2b08cbd ("perf test sample-parsing: Add endian test for struct branch_flags") broke the test case 27 (Sample parsing) on s390 on linux-next tree: # perf test -Fv 27 27: Sample parsing --- start --- parsing failed for sample_type 0x800 ---- end ---- Sample parsing: FAILED! # The cause of the failure is a wrong #define BS_EXPECTED_BE statement in above commit. Correct this define and the test case runs fine. Output After: # perf test -Fv 27 27: Sample parsing : --- start --- ---- end ---- Sample parsing: Ok # Fixes: 10269a2ca2b08c ("perf test sample-parsing: Add endian test for struct branch_flags") Signed-off-by: Thomas Richter Tested-by: Madhavan Srinivasan Acked-by: Madhavan Srinivasan CC: Sven Schnelle Cc: Heiko Carstens Cc: Jiri Olsa Cc: Sumanth Korikkar Cc: Vasily Gorbik Link: https://lore.kernel.org/r/54077e81-503e-3405-6cb0-6541eb5532cc@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/sample-parsing.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index b669d22f2b13..07f2411b0ad4 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -36,7 +36,7 @@ * These are based on the input value (213) specified * in branch_stack variable. 
*/ -#define BS_EXPECTED_BE 0xa00d000000000000 +#define BS_EXPECTED_BE 0xa000d00000000000 #define BS_EXPECTED_LE 0xd5000000 #define FLAG(s) s->branch_stack->entries[i].flags From 92723ea0f11d92496687db8c9725248e9d1e5e1d Mon Sep 17 00:00:00 2001 From: Sohaib Mohamed Date: Wed, 10 Nov 2021 04:20:11 +0200 Subject: [PATCH 293/336] perf bench: Fix two memory leaks detected with ASan ASan reports memory leaks while running: $ perf bench sched all Fixes: e27454cc6352c422 ("perf bench: Add sched-messaging.c: Benchmark for scheduler and IPC mechanisms based on hackbench") Signed-off-by: Sohaib Mohamed Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Hitoshi Mitake Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Paul Russel Cc: Peter Zijlstra Cc: Pierre Gondois Link: http://lore.kernel.org/lkml/20211110022012.16620-1-sohaib.amhmd@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/sched-messaging.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index 488f6e6ba1a5..fa0ff4ce2b74 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -223,6 +223,8 @@ static unsigned int group(pthread_t *pth, snd_ctx->out_fds[i] = fds[1]; if (!thread_mode) close(fds[0]); + + free(ctx); } /* Now we have all the fds, fork the senders */ @@ -239,6 +241,8 @@ static unsigned int group(pthread_t *pth, for (i = 0; i < num_fds; i++) close(snd_ctx->out_fds[i]); + free(snd_ctx); + /* Return number of children to reap */ return num_fds * 2; } From 9e1a8d9f683260d50e0a14176d3f7c46a93b2700 Mon Sep 17 00:00:00 2001 From: German Gomez Date: Fri, 5 Nov 2021 10:41:30 +0000 Subject: [PATCH 294/336] perf inject: Fix ARM SPE handling 'perf inject' is currently not working for Arm SPE. When you try to run 'perf inject' and 'perf report' with a perf.data file that contains SPE traces, the tool reports a "Bad address" error: # ./perf record -e arm_spe_0/ts_enable=1,store_filter=1,branch_filter=1,load_filter=1/ -a -- sleep 1 # ./perf inject -i perf.data -o perf.inject.data --itrace # ./perf report -i perf.inject.data --stdio 0x42c00 [0x8]: failed to process type: 9 [Bad address] Error: failed to process sample As far as I know, the issue was first spotted in [1], but 'perf inject' was not yet injecting the samples. This patch does something similar to what cs_etm does for injecting the samples [2], but for SPE. 
[1] https://patchwork.kernel.org/project/linux-arm-kernel/cover/20210412091006.468557-1-leo.yan@linaro.org/#24117339 [2] https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/tools/perf/util/cs-etm.c?h=perf/core&id=133fe2e617e48ca0948983329f43877064ffda3e#n1196 Reviewed-by: James Clark Signed-off-by: German Gomez Cc: Alexander Shishkin Cc: Jiri Olsa Cc: John Garry Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20211105104130.28186-2-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/arm-spe.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 4748bcfe61de..fccac06b573a 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -51,6 +51,7 @@ struct arm_spe { u8 timeless_decoding; u8 data_queued; + u64 sample_type; u8 sample_flc; u8 sample_llc; u8 sample_tlb; @@ -287,6 +288,12 @@ static void arm_spe_prep_sample(struct arm_spe *spe, event->sample.header.size = sizeof(struct perf_event_header); } +static int arm_spe__inject_event(union perf_event *event, struct perf_sample *sample, u64 type) +{ + event->header.size = perf_event__sample_event_size(sample, type, 0); + return perf_event__synthesize_sample(event, type, 0, sample); +} + static inline int arm_spe_deliver_synth_event(struct arm_spe *spe, struct arm_spe_queue *speq __maybe_unused, @@ -295,6 +302,12 @@ arm_spe_deliver_synth_event(struct arm_spe *spe, { int ret; + if (spe->synth_opts.inject) { + ret = arm_spe__inject_event(event, sample, spe->sample_type); + if (ret) + return ret; + } + ret = perf_session__deliver_synth_event(spe->session, event, sample); if (ret) pr_err("ARM SPE: failed to deliver event, error %d\n", ret); @@ -986,6 +999,8 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) else attr.sample_type |= PERF_SAMPLE_TIME; + spe->sample_type = attr.sample_type; + attr.exclude_user = evsel->core.attr.exclude_user; attr.exclude_kernel = evsel->core.attr.exclude_kernel; attr.exclude_hv = evsel->core.attr.exclude_hv; From e8c04ea0fef5731dbcaabac86d65254c227aedf4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 17 Nov 2021 14:29:31 -0300 Subject: [PATCH 295/336] tools build: Fix removal of feature-sync-compare-and-swap feature detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch removing the feature-sync-compare-and-swap feature detection didn't remove the call to main_test_sync_compare_and_swap(), making the 'test-all' case fail and all the feature tests be performed individually: $ cat /tmp/build/perf/feature/test-all.make.output In file included from test-all.c:18: test-libpython-version.c:5:10: error: #error 5 | #error | ^~~~~ test-all.c: In function ‘main’: test-all.c:203:9: error: implicit declaration of function ‘main_test_sync_compare_and_swap’ [-Werror=implicit-function-declaration] 203 | main_test_sync_compare_and_swap(argc, argv); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors $ Fix it; now to figure out what that test-libpython-version.c problem is...
Fixes: 60fa754b2a5a4e0c ("tools: Remove feature-sync-compare-and-swap feature detection") Cc: Jiri Olsa Cc: Leo Yan Cc: Mark Rutland Cc: Mathieu Poirier Cc: Mike Leach Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/YZU9Fe0sgkHSXeC2@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/test-all.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index 920439527291..0b243ce842be 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -200,7 +200,6 @@ int main(int argc, char *argv[]) main_test_timerfd(); main_test_stackprotector_all(); main_test_libdw_dwarf_unwind(); - main_test_sync_compare_and_swap(argc, argv); main_test_zlib(); main_test_pthread_attr_setaffinity_np(); main_test_pthread_barrier(); From 8b8dcc3720d57d88faa74e0da2fb419da953ddd0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 8 Sep 2021 16:09:08 -0300 Subject: [PATCH 296/336] tools headers UAPI: Sync MIPS syscall table file changed by new futex_waitv syscall To pick the changes in these csets: b3ff2881ba18b852 ("MIPS: syscalls: Wire up futex_waitv syscall") That add support for this new syscall in tools such as 'perf trace'. For instance, this is now possible (adapted from the x86_64 test output): # perf trace -e futex_waitv ^C# # perf trace -v -e futex_waitv event qualifier tracepoint filter: (common_pid != 807333 && common_pid != 3564) && (id == 449) ^C# # perf trace -v -e futex* --max-events 10 event qualifier tracepoint filter: (common_pid != 812168 && common_pid != 3564) && (id == 202 || id == 449) mmap size 528384B ? ( ): Timer/219310 ... [continued]: futex()) = -1 ETIMEDOUT (Connection timed out) 0.012 ( 0.002 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1) = 0 0.024 ( 0.060 ms): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) = 0 0.086 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1) = 0 0.088 ( ): Timer/219310 futex(uaddr: 0x7fd0b152d424, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ... 0.075 ( 0.005 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d420, op: WAKE|PRIVATE_FLAG, val: 1) = 1 0.169 ( 0.004 ms): Web Content/219299 futex(uaddr: 0x7fd0b152d424, op: WAKE|PRIVATE_FLAG, val: 1) = 1 0.088 ( 0.089 ms): Timer/219310 ... [continued]: futex()) = 0 0.179 ( 0.001 ms): Timer/219310 futex(uaddr: 0x7fd0b152d3c8, op: WAKE|PRIVATE_FLAG, val: 1) = 0 0.181 ( ): Timer/219310 futex(uaddr: 0x7fd0b152d420, op: WAIT_BITSET|PRIVATE_FLAG, utime: 0x7fd0b1657840, val3: MATCH_ANY) ... # That is the filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. 
$ grep futex_waitv tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl 449 n64 futex_waitv sys_futex_waitv $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Thomas Bogendoerfer Cc: Wang Haojun Link: https://lore.kernel.org/lkml/YZZRxuIyvSGLZhM4@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 1ca7bc337932..e2c481fcede6 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -363,3 +363,4 @@ 446 n64 landlock_restrict_self sys_landlock_restrict_self # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease +449 n64 futex_waitv sys_futex_waitv From 0ca1f534a776cc7d42f2c33da4732b74ec2790cd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Nov 2021 23:12:47 -0800 Subject: [PATCH 297/336] perf hist: Fix memory leak of a perf_hpp_fmt perf_hpp__column_unregister() removes an entry from a list but doesn't free the memory causing a memory leak spotted by leak sanitizer. Add the free while at the same time reducing the scope of the function to static. Signed-off-by: Ian Rogers Reviewed-by: Kajol Jain Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211118071247.2140392-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/hist.c | 28 ++++++++++++++-------------- tools/perf/util/hist.h | 1 - 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index c1f24d004852..5075ecead5f3 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -535,6 +535,18 @@ struct perf_hpp_list perf_hpp_list = { #undef __HPP_SORT_ACC_FN #undef __HPP_SORT_RAW_FN +static void fmt_free(struct perf_hpp_fmt *fmt) +{ + /* + * At this point fmt should be completely + * unhooked, if not it's a bug. + */ + BUG_ON(!list_empty(&fmt->list)); + BUG_ON(!list_empty(&fmt->sort_list)); + + if (fmt->free) + fmt->free(fmt); +} void perf_hpp__init(void) { @@ -598,9 +610,10 @@ void perf_hpp_list__prepend_sort_field(struct perf_hpp_list *list, list_add(&format->sort_list, &list->sorts); } -void perf_hpp__column_unregister(struct perf_hpp_fmt *format) +static void perf_hpp__column_unregister(struct perf_hpp_fmt *format) { list_del_init(&format->list); + fmt_free(format); } void perf_hpp__cancel_cumulate(void) @@ -672,19 +685,6 @@ next: } -static void fmt_free(struct perf_hpp_fmt *fmt) -{ - /* - * At this point fmt should be completely - * unhooked, if not it's a bug. 
- */ - BUG_ON(!list_empty(&fmt->list)); - BUG_ON(!list_empty(&fmt->sort_list)); - - if (fmt->free) - fmt->free(fmt); -} - void perf_hpp__reset_output_field(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt, *tmp; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5343b62476e6..621f35ae1efa 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -369,7 +369,6 @@ enum { }; void perf_hpp__init(void); -void perf_hpp__column_unregister(struct perf_hpp_fmt *format); void perf_hpp__cancel_cumulate(void); void perf_hpp__setup_output_field(struct perf_hpp_list *list); void perf_hpp__reset_output_field(struct perf_hpp_list *list); From d9fc706108c15f8bc2d4ccccf8e50f74830fabd9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Wed, 17 Nov 2021 23:38:04 -0800 Subject: [PATCH 298/336] perf report: Fix memory leaks around perf_tip() perf_tip() may allocate memory or use a literal, this means memory wasn't freed if allocated. Change the API so that literals aren't used. At the same time add missing frees for system_path. These issues were spotted using leak sanitizer. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211118073804.2149974-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-report.c | 15 +++++++++------ tools/perf/util/util.c | 14 +++++++------- tools/perf/util/util.h | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8167ebfe776a..8ae400429870 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -619,14 +619,17 @@ static int report__browse_hists(struct report *rep) int ret; struct perf_session *session = rep->session; struct evlist *evlist = session->evlist; - const char *help = perf_tip(system_path(TIPDIR)); + char *help = NULL, *path = NULL; - if (help == NULL) { + path = system_path(TIPDIR); + if (perf_tip(&help, path) || help == NULL) { /* fallback for people who don't install perf ;-) */ - help = perf_tip(DOCDIR); - if (help == NULL) - help = "Cannot load tips.txt file, please install perf!"; + free(path); + path = system_path(DOCDIR); + if (perf_tip(&help, path) || help == NULL) + help = strdup("Cannot load tips.txt file, please install perf!"); } + free(path); switch (use_browser) { case 1: @@ -651,7 +654,7 @@ static int report__browse_hists(struct report *rep) ret = evlist__tty_browse_hists(evlist, rep, help); break; } - + free(help); return ret; } diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 37a9492edb3e..df3c4671be72 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -379,32 +379,32 @@ fetch_kernel_version(unsigned int *puint, char *str, return 0; } -const char *perf_tip(const char *dirpath) +int perf_tip(char **strp, const char *dirpath) { struct strlist *tips; struct str_node *node; - char *tip = NULL; struct strlist_config conf = { .dirname = dirpath, .file_only = true, }; + int ret = 0; + *strp = NULL; tips = strlist__new("tips.txt", &conf); if (tips == NULL) - return errno == ENOENT ? NULL : - "Tip: check path of tips.txt or get more memory! ;-p"; + return -errno; if (strlist__nr_entries(tips) == 0) goto out; node = strlist__entry(tips, random() % strlist__nr_entries(tips)); - if (asprintf(&tip, "Tip: %s", node->s) < 0) - tip = (char *)"Tip: get more memory! 
;-)"; + if (asprintf(strp, "Tip: %s", node->s) < 0) + ret = -ENOMEM; out: strlist__delete(tips); - return tip; + return ret; } char *perf_exe(char *buf, int len) diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index ad737052e597..9f0d36ba77f2 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -39,7 +39,7 @@ int fetch_kernel_version(unsigned int *puint, #define KVER_FMT "%d.%d.%d" #define KVER_PARAM(x) KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x) -const char *perf_tip(const char *dirpath); +int perf_tip(char **strp, const char *dirpath); #ifndef HAVE_SCHED_GETCPU_SUPPORT int sched_getcpu(void); From b194c9cd09dd98af76beaa32a041af674260d730 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 18 Nov 2021 00:47:49 -0800 Subject: [PATCH 299/336] perf evsel: Fix memory leaks relating to unit unit may have a strdup pointer or be to a literal, consequently memory assocciated with it isn't freed. Change it so the unit is always strdup and so the memory can be safely freed. Fix related issue in perf_event__process_event_update() for name and own_cpus. Leaks were spotted by leak sanitizer. Signed-off-by: Ian Rogers Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20211118084749.2191447-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/event_update.c | 5 ++--- tools/perf/util/evsel.c | 18 +++++++++--------- tools/perf/util/header.c | 8 +++++--- tools/perf/util/parse-events.c | 9 ++++++--- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index fbb68deba59f..d01532d40acb 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -88,7 +88,6 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes struct evsel *evsel; struct event_name tmp; struct evlist *evlist = evlist__new_default(); - char *unit = strdup("KRAVA"); TEST_ASSERT_VAL("failed to get evlist", evlist); @@ -99,7 +98,8 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes perf_evlist__id_add(&evlist->core, &evsel->core, 0, 0, 123); - evsel->unit = unit; + free((char *)evsel->unit); + evsel->unit = strdup("KRAVA"); TEST_ASSERT_VAL("failed to synthesize attr update unit", !perf_event__synthesize_event_update_unit(NULL, evsel, process_event_unit)); @@ -119,7 +119,6 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes TEST_ASSERT_VAL("failed to synthesize attr update cpus", !perf_event__synthesize_event_update_cpus(&tmp.tool, evsel, process_event_cpus)); - free(unit); evlist__delete(evlist); return 0; } diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index a59fb2ecb84e..ac0127be0459 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -241,7 +241,7 @@ void evsel__init(struct evsel *evsel, { perf_evsel__init(&evsel->core, attr, idx); evsel->tracking = !idx; - evsel->unit = ""; + evsel->unit = strdup(""); evsel->scale = 1.0; evsel->max_events = ULONG_MAX; evsel->evlist = NULL; @@ -276,13 +276,8 @@ struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx) } if (evsel__is_clock(evsel)) { - /* - * The evsel->unit points to static alias->unit - * so it's ok to use static string in here. 
- */ - static const char *unit = "msec"; - - evsel->unit = unit; + free((char *)evsel->unit); + evsel->unit = strdup("msec"); evsel->scale = 1e-6; } @@ -420,7 +415,11 @@ struct evsel *evsel__clone(struct evsel *orig) evsel->max_events = orig->max_events; evsel->tool_event = orig->tool_event; - evsel->unit = orig->unit; + free((char *)evsel->unit); + evsel->unit = strdup(orig->unit); + if (evsel->unit == NULL) + goto out_err; + evsel->scale = orig->scale; evsel->snapshot = orig->snapshot; evsel->per_pkg = orig->per_pkg; @@ -1441,6 +1440,7 @@ void evsel__exit(struct evsel *evsel) zfree(&evsel->group_name); zfree(&evsel->name); zfree(&evsel->pmu_name); + zfree(&evsel->unit); zfree(&evsel->metric_id); evsel__zero_per_pkg(evsel); hashmap__free(evsel->per_pkg_mask); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index fda8d14c891f..79cce216727e 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -4257,9 +4257,11 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, switch (ev->type) { case PERF_EVENT_UPDATE__UNIT: + free((char *)evsel->unit); evsel->unit = strdup(ev->data); break; case PERF_EVENT_UPDATE__NAME: + free(evsel->name); evsel->name = strdup(ev->data); break; case PERF_EVENT_UPDATE__SCALE: @@ -4268,11 +4270,11 @@ int perf_event__process_event_update(struct perf_tool *tool __maybe_unused, break; case PERF_EVENT_UPDATE__CPUS: ev_cpus = (struct perf_record_event_update_cpus *)ev->data; - map = cpu_map__new_data(&ev_cpus->cpus); - if (map) + if (map) { + perf_cpu_map__put(evsel->core.own_cpus); evsel->core.own_cpus = map; - else + } else pr_err("failed to get event_update cpus\n"); default: break; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 5bfb6f892489..ba74fdf74af9 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -402,8 +402,10 @@ static int add_event_tool(struct list_head *list, int *idx, if (!evsel) return -ENOMEM; evsel->tool_event = tool_event; - if (tool_event == PERF_TOOL_DURATION_TIME) - evsel->unit = "ns"; + if (tool_event == PERF_TOOL_DURATION_TIME) { + free((char *)evsel->unit); + evsel->unit = strdup("ns"); + } return 0; } @@ -1630,7 +1632,8 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (parse_state->fake_pmu) return 0; - evsel->unit = info.unit; + free((char *)evsel->unit); + evsel->unit = strdup(info.unit); evsel->scale = info.scale; evsel->per_pkg = info.per_pkg; evsel->snapshot = info.snapshot; From 3b90954419d4c05651de9cce6d7632bcf6977678 Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Mon, 15 Nov 2021 07:40:25 +0100 Subject: [PATCH 300/336] s390/dump: fix copying to user-space of swapped kdump oldmem This commit fixes a bug introduced by commit e9e7870f90e3 ("s390/dump: introduce boot data 'oldmem_data'"). OLDMEM_BASE was mistakenly replaced by oldmem_data.size instead of oldmem_data.start. This bug caused the following error during kdump: kdump.sh[878]: No program header covering vaddr 0x3434f5245found kexec bug? 
Fixes: e9e7870f90e3 ("s390/dump: introduce boot data 'oldmem_data'") Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Alexander Egorenkov Reviewed-by: Marc Hartmayer Reviewed-by: Heiko Carstens Signed-off-by: Heiko Carstens --- arch/s390/kernel/crash_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d72a6df058d7..785d54c9350c 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -191,8 +191,8 @@ static int copy_oldmem_user(void __user *dst, void *src, size_t count) return rc; } else { /* Check for swapped kdump oldmem areas */ - if (oldmem_data.start && from - oldmem_data.size < oldmem_data.size) { - from -= oldmem_data.size; + if (oldmem_data.start && from - oldmem_data.start < oldmem_data.size) { + from -= oldmem_data.start; len = min(count, oldmem_data.size - from); } else if (oldmem_data.start && from < oldmem_data.size) { len = min(count, oldmem_data.size - from); From 20c76e242e7025bd355619ba67beb243ba1a1e95 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Nov 2021 11:06:38 +0100 Subject: [PATCH 301/336] s390/kexec: fix return code handling kexec_file_add_ipl_report ignores that ipl_report_finish may fail and can return an error pointer instead of a valid pointer. Fix this and simplify by returning NULL in case of an error and let the only caller handle this case. Fixes: 99feaa717e55 ("s390/kexec_file: Create ipl report and pass to next kernel") Signed-off-by: Heiko Carstens --- arch/s390/kernel/ipl.c | 3 ++- arch/s390/kernel/machine_kexec_file.c | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index e2cc35775b99..5ad1dde23dc5 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2156,7 +2156,7 @@ void *ipl_report_finish(struct ipl_report *report) buf = vzalloc(report->size); if (!buf) - return ERR_PTR(-ENOMEM); + goto out; ptr = buf; memcpy(ptr, report->ipib, report->ipib->hdr.len); @@ -2195,6 +2195,7 @@ void *ipl_report_finish(struct ipl_report *report) } BUG_ON(ptr > buf + report->size); +out: return buf; } diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 528edff085d9..f0200b503f94 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -170,6 +170,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, struct kexec_buf buf; unsigned long addr; void *ptr, *end; + int ret; buf.image = image; @@ -199,7 +200,10 @@ static int kexec_file_add_ipl_report(struct kimage *image, ptr += len; } + ret = -ENOMEM; buf.buffer = ipl_report_finish(data->report); + if (!buf.buffer) + goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; @@ -209,7 +213,9 @@ static int kexec_file_add_ipl_report(struct kimage *image, data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); *lc_ipl_parmblock_ptr = (__u32)buf.mem; - return kexec_add_buffer(&buf); + ret = kexec_add_buffer(&buf); +out: + return ret; } void *kexec_file_add_components(struct kimage *image, From 4aa9340584e37debef06fa99b56d064beb723891 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 16 Nov 2021 11:31:01 +0800 Subject: [PATCH 302/336] s390/kexec: fix memory leak of ipl report buffer unreferenced object 0x38000195000 (size 4096): comm "kexec", pid 8548, jiffies 4294953647 (age 32443.270s) hex dump (first 32 bytes): 00 00 00 c8 20 00 00 00 00 00 00 c0 02 80 00 00 .... ........... 
40 40 40 40 40 40 40 40 00 00 00 00 00 00 00 00 @@@@@@@@........ backtrace: [<0000000011a2f199>] __vmalloc_node_range+0xc0/0x140 [<0000000081fa2752>] vzalloc+0x5a/0x70 [<0000000063a4c92d>] ipl_report_finish+0x2c/0x180 [<00000000553304da>] kexec_file_add_ipl_report+0xf4/0x150 [<00000000862d033f>] kexec_file_add_components+0x124/0x160 [<000000000d2717bb>] arch_kexec_kernel_image_load+0x62/0x90 [<000000002e0373b6>] kimage_file_alloc_init+0x1aa/0x2e0 [<0000000060f2d14f>] __do_sys_kexec_file_load+0x17c/0x2c0 [<000000008c86fe5a>] __s390x_sys_kexec_file_load+0x40/0x50 [<000000001fdb9dac>] __do_syscall+0x1bc/0x1f0 [<000000003ee4258d>] system_call+0x78/0xa0 Signed-off-by: Baoquan He Reviewed-by: Philipp Rudo Fixes: 99feaa717e55 ("s390/kexec_file: Create ipl report and pass to next kernel") Cc: # v5.2: 20c76e242e70: s390/kexec: fix return code handling Cc: # v5.2 Link: https://lore.kernel.org/r/20211116033101.GD21646@MiWiFi-R3L-srv Signed-off-by: Heiko Carstens --- arch/s390/include/asm/kexec.h | 6 ++++++ arch/s390/kernel/machine_kexec_file.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index ea398a05f643..7f3c9ac34bd8 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -74,6 +74,12 @@ void *kexec_file_add_components(struct kimage *image, int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, unsigned long addr); +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + void *ipl_buf; +}; + extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index f0200b503f94..9975ad200d74 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -206,6 +207,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, goto out; buf.bufsz = data->report->size; buf.memsz = buf.bufsz; + image->arch.ipl_buf = buf.buffer; data->memsz += buf.memsz; @@ -328,3 +330,11 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, } return 0; } + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->arch.ipl_buf); + image->arch.ipl_buf = NULL; + + return kexec_image_post_load_cleanup_default(image); +} From f1ab2e0d4cbd5d81bf9be187b38192efba3d96e7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 16 Nov 2021 14:58:03 +0100 Subject: [PATCH 303/336] MAINTAINERS: update email address of Christian Borntraeger My borntraeger@de.ibm.com email is just a forwarder to the linux.ibm.com address. Let us remove the extra hop to avoid a potential source of errors. While at it, add the relevant email addresses to mailmap. 
Signed-off-by: Christian Borntraeger Link: https://lore.kernel.org/r/20211116135803.119489-2-borntraeger@linux.ibm.com Signed-off-by: Heiko Carstens --- .mailmap | 3 +++ MAINTAINERS | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 14314e3c5d5e..6277bb27b4bf 100644 --- a/.mailmap +++ b/.mailmap @@ -71,6 +71,9 @@ Chao Yu Chao Yu Chris Chiu Chris Chiu +Christian Borntraeger +Christian Borntraeger +Christian Borntraeger Christophe Ricard Christoph Hellwig Colin Ian King diff --git a/MAINTAINERS b/MAINTAINERS index 7a2345ce8521..b9a09edb3efb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10445,7 +10445,7 @@ F: arch/riscv/include/uapi/asm/kvm* F: arch/riscv/kvm/ KERNEL VIRTUAL MACHINE for s390 (KVM/s390) -M: Christian Borntraeger +M: Christian Borntraeger M: Janosch Frank R: David Hildenbrand R: Claudio Imbrenda @@ -16573,7 +16573,7 @@ F: drivers/video/fbdev/savage/ S390 M: Heiko Carstens M: Vasily Gorbik -M: Christian Borntraeger +M: Christian Borntraeger R: Alexander Gordeev L: linux-s390@vger.kernel.org S: Supported From 503e45108451dd4227c6f15c52ba459c29a86840 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 15 Nov 2021 20:56:13 +0100 Subject: [PATCH 304/336] ftrace/samples: add missing Kconfig option for ftrace direct multi sample Currently it is not possible to build the ftrace direct multi example anymore due to broken config dependencies. Fix this by adding SAMPLE_FTRACE_DIRECT_MULTI config option. This broke when merging s390-5.16-1 due to an incorrect merge conflict resolution proposed by me. Also rename SAMPLE_FTRACE_MULTI_DIRECT to SAMPLE_FTRACE_DIRECT_MULTI so it matches the module name. Fixes: 0b707e572a19 ("Merge tag 's390-5.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux") Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20211115195614.3173346-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/x86/Kconfig | 2 +- samples/Kconfig | 11 ++++++++++- samples/Makefile | 2 +- samples/ftrace/Makefile | 2 +- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 95dd1ee01546..7399327d1eff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -193,7 +193,7 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 - select HAVE_SAMPLE_FTRACE_MULTI_DIRECT if X86_64 + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA diff --git a/samples/Kconfig b/samples/Kconfig index bec3528aa2de..43d2e9aa557f 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -31,6 +31,15 @@ config SAMPLE_FTRACE_DIRECT This builds an ftrace direct function example that hooks to wake_up_process and prints the parameters. +config SAMPLE_FTRACE_DIRECT_MULTI + tristate "Build register_ftrace_direct_multi() example" + depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS && m + depends on HAVE_SAMPLE_FTRACE_DIRECT_MULTI + help + This builds an ftrace direct function example + that hooks to wake_up_process and schedule, and prints + the function addresses. 
+ config SAMPLE_TRACE_ARRAY tristate "Build sample module for kernel access to Ftrace instancess" depends on EVENT_TRACING && m @@ -237,5 +246,5 @@ endif # SAMPLES config HAVE_SAMPLE_FTRACE_DIRECT bool -config HAVE_SAMPLE_FTRACE_MULTI_DIRECT +config HAVE_SAMPLE_FTRACE_DIRECT_MULTI bool diff --git a/samples/Makefile b/samples/Makefile index b7b98307c2b4..4bcd6b93bffa 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -22,7 +22,7 @@ subdir-$(CONFIG_SAMPLE_TIMER) += timers obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace_events/ obj-$(CONFIG_SAMPLE_TRACE_PRINTK) += trace_printk/ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace/ -obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace/ +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace/ obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += ftrace/ subdir-$(CONFIG_SAMPLE_UHID) += uhid obj-$(CONFIG_VIDEO_PCI_SKELETON) += v4l/ diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index e8a3f8520a44..b9198e2eef28 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o -obj-$(CONFIG_SAMPLE_FTRACE_MULTI_DIRECT) += ftrace-direct-multi.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT_MULTI) += ftrace-direct-multi.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o From 890e3dc8bb6ee630870560c34054692f7a45da42 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 15 Nov 2021 20:56:14 +0100 Subject: [PATCH 305/336] ftrace/samples: add s390 support for ftrace direct multi sample Add s390 architecture support for the ftrace direct multi sample. See commit 5fae941b9a6f ("ftrace/samples: Add multi direct interface test module") for further details. 
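For context, the generic registration path that the architecture-specific trampoline below plugs into looks roughly like this condensed sketch of samples/ftrace/ftrace-direct-multi.c (the init function is visible as context at the end of the hunk); this is my reading of the sample, not code added by this patch:

```c
/* Condensed sketch of the sample's generic init/exit path: one ftrace_ops
 * is filtered onto several functions and then bound to a single direct
 * trampoline (the arch-specific my_tramp defined above).
 */
static struct ftrace_ops direct;

static int __init ftrace_direct_multi_init(void)
{
	ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0);
	ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0);

	return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
}

static void __exit ftrace_direct_multi_exit(void)
{
	unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp);
}
```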
Link: https://lore.kernel.org/r/20211115195614.3173346-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + samples/ftrace/ftrace-direct-multi.c | 30 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 35f99b8f236e..2a5bb4f29cfe 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -194,6 +194,7 @@ config S390 select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ select HAVE_SAMPLE_FTRACE_DIRECT + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c index b6d7806b400e..2fafc9afcbf0 100644 --- a/samples/ftrace/ftrace-direct-multi.c +++ b/samples/ftrace/ftrace-direct-multi.c @@ -4,6 +4,7 @@ #include /* for handle_mm_fault() */ #include #include +#include extern void my_direct_func(unsigned long ip); @@ -14,6 +15,8 @@ void my_direct_func(unsigned long ip) extern void my_tramp(void *); +#ifdef CONFIG_X86_64 + asm ( " .pushsection .text, \"ax\", @progbits\n" " .type my_tramp, @function\n" @@ -31,6 +34,33 @@ asm ( " .popsection\n" ); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_S390 + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" lgr %r1,%r15\n" +" stmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" stg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" aghi %r15,"__stringify(-STACK_FRAME_OVERHEAD)"\n" +" stg %r1,"__stringify(__SF_BACKCHAIN)"(%r15)\n" +" lgr %r2,%r0\n" +" brasl %r14,my_direct_func\n" +" aghi %r15,"__stringify(STACK_FRAME_OVERHEAD)"\n" +" lmg %r0,%r5,"__stringify(__SF_GPRS)"(%r15)\n" +" lg %r14,"__stringify(__SF_GPRS+8*8)"(%r15)\n" +" lgr %r1,%r0\n" +" br %r1\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +#endif /* CONFIG_S390 */ + static struct ftrace_ops direct; static int __init ftrace_direct_multi_init(void) From f86b0aaad741c45aba5a84a27277dd56a96808ba Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 17 Nov 2021 17:15:42 -0800 Subject: [PATCH 306/336] tracing/histogram: Fix UAF in destroy_hist_field() Calling destroy_hist_field() on an expression will recursively free any operands associated with the expression. If during expression parsing the operands of the expression are already set when an error is encountered, there is no need to explicity free the operands. Doing so will result in destroy_hist_field() being called twice for the operands and lead to a use-after-free (UAF) error. If the operands are associated with the expression, only call destroy_hist_field() on the expression since the operands will be recursively freed. 
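The ownership rule described here, once the operands are hooked into the expression only the expression may be freed, is the usual "transfer ownership, then free only the owner" pattern. A generic C sketch with hypothetical names (not the histogram code itself):

```c
#include <stdlib.h>

/* Hypothetical tree node: destroy() frees a node and, recursively, its operands. */
struct node { struct node *operands[2]; };

static void destroy(struct node *n)
{
	if (!n)
		return;
	destroy(n->operands[0]);
	destroy(n->operands[1]);
	free(n);
}

static struct node *build_expr(struct node *op1, struct node *op2)
{
	struct node *expr = calloc(1, sizeof(*expr));

	if (!expr) {
		destroy(op1);	/* not yet owned by expr: free individually */
		destroy(op2);
		return NULL;
	}
	expr->operands[0] = op1;	/* ownership transferred ... */
	expr->operands[1] = op2;
	/* ... so any later error path must free only expr, never the operands,
	 * or the operands get freed twice (the use-after-free fixed here). */
	return expr;
}
```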
Link: https://lore.kernel.org/all/CAHk-=wgcrEbFgkw9720H3tW-AhHOoEKhYwZinYJw4FpzSaJ6_Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211118011542.1420131-1-kaleshsingh@google.com Suggested-by: Linus Torvalds Signed-off-by: Kalesh Singh Fixes: 8b5d46fd7a38 ("tracing/histogram: Optimize division by constants") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 43 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ea2c9ec54a6..9555b8e1d1e3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2576,28 +2576,27 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* Split the expression string at the root operator */ if (!sep) - goto free; + return ERR_PTR(-EINVAL); + *sep = '\0'; operand1_str = str; str = sep+1; /* Binary operator requires both operands */ if (*operand1_str == '\0' || *str == '\0') - goto free; + return ERR_PTR(-EINVAL); operand_flags = 0; /* LHS of string is an expression e.g. a+b in a+b+c */ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); - if (IS_ERR(operand1)) { - ret = PTR_ERR(operand1); - operand1 = NULL; - goto free; - } + if (IS_ERR(operand1)) + return ERR_CAST(operand1); + if (operand1->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); ret = -EINVAL; - goto free; + goto free_op1; } /* RHS of string is another expression e.g. c in a+b+c */ @@ -2605,13 +2604,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); - operand2 = NULL; - goto free; + goto free_op1; } if (operand2->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); ret = -EINVAL; - goto free; + goto free_operands; } switch (field_op) { @@ -2629,12 +2627,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, break; default: ret = -EINVAL; - goto free; + goto free_operands; } ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) - goto free; + goto free_operands; operand_flags = var1 ? var1->flags : operand1->flags; operand2_flags = var2 ? 
var2->flags : operand2->flags; @@ -2653,12 +2651,13 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr = create_hist_field(hist_data, NULL, flags, var_name); if (!expr) { ret = -ENOMEM; - goto free; + goto free_operands; } operand1->read_once = true; operand2->read_once = true; + /* The operands are now owned and free'd by 'expr' */ expr->operands[0] = operand1; expr->operands[1] = operand2; @@ -2669,7 +2668,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, if (!divisor) { hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str)); ret = -EDOM; - goto free; + goto free_expr; } /* @@ -2709,18 +2708,22 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; - goto free; + goto free_expr; } expr->name = expr_str(expr, 0); } return expr; -free: - destroy_hist_field(operand1, 0); - destroy_hist_field(operand2, 0); - destroy_hist_field(expr, 0); +free_operands: + destroy_hist_field(operand2, 0); +free_op1: + destroy_hist_field(operand1, 0); + return ERR_PTR(ret); + +free_expr: + destroy_hist_field(expr, 0); return ERR_PTR(ret); } From ae8d67b2117f1ec6c8170d6e1af8ded17392bd2c Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 15 Nov 2021 19:08:19 -0800 Subject: [PATCH 307/336] lib: zstd: Fix unused variable warning The variable `litLengthSum` is only used by an `assert()`, so when asserts are disabled the compiler doesn't see any usage and warns. This issue is already fixed upstream by PR #2838 [0]. It was reported by the Kernel test robot in [1]. Another approach would be to change zstd's disabled `assert()` definition to use the argument in a disabled branch, instead of ignoring the argument. I've avoided this approach because there are some small changes necessary to get zstd to build, and I would want to thoroughly re-test for performance, since that is slightly changing the code in every function in zstd. It seems like a trivial change, but some functions are pretty sensitive to small changes. However, I think it is a valid approach that I would like to see upstream take, so I've opened Issue #2868 to attempt this upstream. Lastly, I've chosen not to use __maybe_unused because all code in lib/zstd/ must eventually be upstreamed. Upstream zstd can't use __maybe_unused because it isn't portable across all compilers. [0] https://github.com/facebook/zstd/pull/2838 [1] https://lore.kernel.org/linux-mm/202111120312.833wII4i-lkp@intel.com/T/ [2] https://github.com/facebook/zstd/issues/2868 Link: https://lore.kernel.org/r/20211117014949.1169186-2-nickrterrell@gmail.com/ Link: https://lore.kernel.org/r/20211117201459.1194876-2-nickrterrell@gmail.com/ Reported-by: kernel test robot Signed-off-by: Nick Terrell --- lib/zstd/compress/zstd_compress_superblock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c index ee03e0aedb03..b0610b255653 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -411,6 +411,8 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* const seqDef* sp = sstart; size_t matchLengthSum = 0; size_t litLengthSum = 0; + /* Only used by assert(), suppress unused variable warnings in production. 
*/ + (void)litLengthSum; while (send-sp > 0) { ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); litLengthSum += seqLen.litLength; From 1974990cca43a6ba708a70b15862113eb9c2f399 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Mon, 15 Nov 2021 20:33:08 -0800 Subject: [PATCH 308/336] lib: zstd: Don't inline functions in zstd_opt.c `zstd_opt.c` contains the match finder for the highest compression levels. These levels are already very slow, and are unlikely to be used in the kernel. If they are used, they shouldn't be used in latency sensitive workloads, so slowing them down shouldn't be a big deal. This saves 188 KB of the 288 KB regression reported by Geert Uytterhoeven [0]. I've also opened an issue upstream [1] so that we can properly tackle the code size issue in `zstd_opt.c` for all users, and can hopefully remove this hack in the next zstd version we import. Bloat-o-meter output on x86-64: ``` > ../scripts/bloat-o-meter vmlinux.old vmlinux add/remove: 6/5 grow/shrink: 1/9 up/down: 16673/-209939 (-193266) Function old new delta ZSTD_compressBlock_opt_generic.constprop - 7559 +7559 ZSTD_insertBtAndGetAllMatches - 6304 +6304 ZSTD_insertBt1 - 1731 +1731 ZSTD_storeSeq - 693 +693 ZSTD_BtGetAllMatches - 255 +255 ZSTD_updateRep - 128 +128 ZSTD_updateTree 96 99 +3 ZSTD_insertAndFindFirstIndexHash3 81 - -81 ZSTD_setBasePrices.constprop 98 - -98 ZSTD_litLengthPrice.constprop 138 - -138 ZSTD_count 362 181 -181 ZSTD_count_2segments 1407 938 -469 ZSTD_insertBt1.constprop 2689 - -2689 ZSTD_compressBlock_btultra2 19990 423 -19567 ZSTD_compressBlock_btultra 19633 15 -19618 ZSTD_initStats_ultra 19825 - -19825 ZSTD_compressBlock_btopt 20374 12 -20362 ZSTD_compressBlock_btopt_extDict 29984 12 -29972 ZSTD_compressBlock_btultra_extDict 30718 15 -30703 ZSTD_compressBlock_btopt_dictMatchState 32689 12 -32677 ZSTD_compressBlock_btultra_dictMatchState 33574 15 -33559 Total: Before=6611828, After=6418562, chg -2.92% ``` [0] https://lkml.org/lkml/2021/11/14/189 [1] https://github.com/facebook/zstd/issues/2862 Link: https://lore.kernel.org/r/20211117014949.1169186-3-nickrterrell@gmail.com/ Link: https://lore.kernel.org/r/20211117201459.1194876-3-nickrterrell@gmail.com/ Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Geert Uytterhoeven Signed-off-by: Nick Terrell --- lib/zstd/common/compiler.h | 7 +++++++ lib/zstd/compress/zstd_opt.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index a1a051e4bce6..f5a9c70a228a 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -16,6 +16,7 @@ *********************************************************/ /* force inlining */ +#if !defined(ZSTD_NO_INLINE) #if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ # define INLINE_KEYWORD inline #else @@ -24,6 +25,12 @@ #define FORCE_INLINE_ATTR __attribute__((always_inline)) +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif /* On MSVC qsort requires that functions passed into it use the __cdecl calling conversion(CC). diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index 04337050fe9a..dfc55e3e8119 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -8,6 +8,18 @@ * You may select, at your option, one of the above-listed licenses. */ +/* + * Disable inlining for the optimal parser for the kernel build. 
+ * It is unlikely to be used in the kernel, and where it is used + * latency shouldn't matter because it is very slow to begin with. + * We prefer a ~180KB binary size win over faster optimal parsing. + * + * TODO(https://github.com/facebook/zstd/issues/2862): + * Improve the code size of the optimal parser in general, so we + * don't need this hack for the kernel build. + */ +#define ZSTD_NO_INLINE 1 + #include "zstd_compress_internal.h" #include "hist.h" #include "zstd_opt.h" From 7416cdc9b9c10968c57b1f73be5d48b3ecdaf3c8 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 16 Nov 2021 15:11:39 -0800 Subject: [PATCH 309/336] lib: zstd: Don't add -O3 to cflags After the update to zstd-1.4.10 passing -O3 is no longer necessary to get good performance from zstd. Using the default optimization level -O2 is sufficient to get good performance. I've measured no significant change to compression speed, and a ~1% decompression speed loss, which is acceptable. This fixes the reported parisc -Wframe-larger-than=1536 errors [0]. The gcc-8-hppa-linux-gnu compiler performed very poorly with -O3, generating stacks that are ~3KB. With -O2 these same functions generate stacks in the < 100B, completely fixing the problem. Function size deltas are listed below: ZSTD_compressBlock_fast_extDict_generic: 3800 -> 68 ZSTD_compressBlock_fast: 2216 -> 40 ZSTD_compressBlock_fast_dictMatchState: 1848 -> 64 ZSTD_compressBlock_doubleFast_extDict_generic: 3744 -> 76 ZSTD_fillDoubleHashTable: 3252 -> 0 ZSTD_compressBlock_doubleFast: 5856 -> 36 ZSTD_compressBlock_doubleFast_dictMatchState: 5380 -> 84 ZSTD_copmressBlock_lazy2: 2420 -> 72 Additionally, this improves the reported code bloat [1]. With gcc-11 bloat-o-meter shows an 80KB code size improvement: ``` > ../scripts/bloat-o-meter vmlinux.old vmlinux add/remove: 31/8 grow/shrink: 24/155 up/down: 25734/-107924 (-82190) Total: Before=6418562, After=6336372, chg -1.28% ``` Compared to before the zstd-1.4.10 update we see a total code size regression of 105KB, down from 374KB at v5.16-rc1: ``` > ../scripts/bloat-o-meter vmlinux.old vmlinux add/remove: 292/62 grow/shrink: 56/88 up/down: 235009/-127487 (107522) Total: Before=6228850, After=6336372, chg +1.73% ``` [0] https://lkml.org/lkml/2021/11/15/710 [1] https://lkml.org/lkml/2021/11/14/189 Link: https://lore.kernel.org/r/20211117014949.1169186-4-nickrterrell@gmail.com/ Link: https://lore.kernel.org/r/20211117201459.1194876-4-nickrterrell@gmail.com/ Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Geert Uytterhoeven Signed-off-by: Nick Terrell --- lib/zstd/Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 65218ec5b8f2..fc45339fc3a3 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -11,8 +11,6 @@ obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o -ccflags-y += -O3 - zstd_compress-y := \ zstd_compress_module.o \ common/debug.o \ From c4c1dbcc09e723295969a62aff401815b7ee15f4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 12:22:17 -0800 Subject: [PATCH 310/336] tracing: Use memset_startat() to zero struct trace_iterator In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Use memset_startat() to avoid confusing memset() about writing beyond the target struct member. 
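Semantically, memset_startat(obj, v, member) clears everything from 'member' through the end of the structure. An illustrative stand-in (only a sketch of the intended semantics, not the kernel's exact macro, which may differ in details) could look like:

```c
#include <string.h>
#include <stddef.h>

/* Illustrative stand-in: zero *obj from 'member' to the end of the struct. */
#define my_memset_startat(obj, v, member)				\
	memset((char *)(obj) + offsetof(typeof(*(obj)), member), (v),	\
	       sizeof(*(obj)) - offsetof(typeof(*(obj)), member))

/* e.g. my_memset_startat(iter, 0, seq) matches the replacement in the hunk below. */
```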
Link: https://lkml.kernel.org/r/20211118202217.1285588-1-keescook@chromium.org Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9139dc1262c..e3c80cfd4eec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6706,9 +6706,7 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + memset_startat(iter, 0, seq); cpumask_clear(iter->started); trace_seq_init(&iter->seq); iter->pos = -1; From 2ef75e9bd2c998f1c6f6f23a3744136105ddefd5 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 18 Nov 2021 17:55:16 +0300 Subject: [PATCH 311/336] tracing: Don't use out-of-sync va_list in event printing If trace_seq becomes full, trace_seq_vprintf() no longer consumes arguments from the va_list, making the va_list out of sync with format processing by trace_check_vprintf(). This causes va_arg() in trace_check_vprintf() to return the wrong positional argument, which results in a WARN_ON_ONCE() hit. ftrace_stress_test from LTP triggers this situation. Fix it by explicitly avoiding further use of the va_list at the point when its consistency can no longer be guaranteed. Link: https://lkml.kernel.org/r/20211118145516.13219-1-nikita.yushchenko@virtuozzo.com Signed-off-by: Nikita Yushchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3c80cfd4eec..88de94da596b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + /* + * If iter->seq is full, the above call no longer guarantees + * that ap is in sync with fmt processing, and further calls + * to va_arg() can return wrong positional arguments. + * + * Ensure that ap is no longer used in this case. + */ + if (iter->seq.full) { + p = ""; + break; + } + if (star) len = va_arg(ap, int); From 12c484c12b1995dd83862ef51e03f9b2d99e9060 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 17 Nov 2021 14:17:05 +0530 Subject: [PATCH 312/336] RISC-V: Enable KVM in RV64 and RV32 defconfigs as a module Let's enable KVM RISC-V in RV64 and RV32 defconfigs as a module so that it is always built along with the default kernel image.
Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 2 ++ arch/riscv/configs/rv32_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index c252fd5706d2..ef473e2f503b 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -19,6 +19,8 @@ CONFIG_SOC_VIRT=y CONFIG_SOC_MICROCHIP_POLARFIRE=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig index 434ef5b64599..6e9f12ff968a 100644 --- a/arch/riscv/configs/rv32_defconfig +++ b/arch/riscv/configs/rv32_defconfig @@ -19,6 +19,8 @@ CONFIG_SOC_VIRT=y CONFIG_ARCH_RV32I=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y From 5a19c7e06236a9c55dfc001bb4d1a8f1950d23e7 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 2 Nov 2021 16:51:43 +0100 Subject: [PATCH 313/336] riscv: fix building external modules When building external modules, vdso_prepare should not be run. If the kernel sources are read-only, it will fail. Fixes: fde9c59aebaf ("riscv: explicitly use symbol offsets for VDSO") Signed-off-by: Andreas Schwab Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 5927c94302b8..8a107ed18b0d 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -107,11 +107,13 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ +ifeq ($(KBUILD_EXTMOD),) ifeq ($(CONFIG_MMU),y) prepare: vdso_prepare vdso_prepare: prepare0 $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso include/generated/vdso-offsets.h endif +endif ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy) From 15c30104965101b8e76b24d27035569d6613a7d6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 2 Nov 2021 10:07:05 +0800 Subject: [PATCH 314/336] blk-cgroup: fix missing put device in error path from blkg_conf_pref() If blk_queue_enter() failed due to queue is dying, the blkdev_put_no_open() is needed because blkcg_conf_open_bdev() succeeded. 
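The fix follows the usual "unwind in reverse order of acquisition" error-path pattern: each reference taken before a failure point gets its own label, and the labels run backwards. A generic sketch with hypothetical helper names standing in for the block-layer calls (not the blkg_conf_prep() code itself), mirroring the shape of the diff that follows:

```c
/* Hypothetical helpers standing in for the get/enter and put/exit pairs. */
extern int get_device_ref(void);
extern void put_device_ref(void);
extern int enter_queue(void);
extern void exit_queue(void);
extern int do_work(void);

static int setup(void)
{
	int ret;

	ret = get_device_ref();
	if (ret)
		return ret;		/* nothing else held yet */

	ret = enter_queue();
	if (ret)
		goto fail_put;		/* the device ref must still be dropped */

	ret = do_work();
	if (ret)
		goto fail_exit_queue;

	return 0;

fail_exit_queue:
	exit_queue();
fail_put:
	put_device_ref();
	return ret;
}
```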
Fixes: 0c9d338c8443 ("blk-cgroup: synchronize blkg creation against policy deactivation") Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20211102020705.2321858-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 88b1fce90520..663aabfeba18 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -640,7 +640,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, */ ret = blk_queue_enter(q, 0); if (ret) - return ret; + goto fail; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -676,13 +676,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, new_blkg = blkg_alloc(pos, q, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail; + goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail; + goto fail_exit_queue; } rcu_read_lock(); @@ -722,9 +722,10 @@ fail_preloaded: fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); +fail_exit_queue: + blk_queue_exit(q); fail: blkdev_put_no_open(bdev); - blk_queue_exit(q); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue From 2b504bd4841bccbf3eb83c1fec229b65956ad8ad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 18 Nov 2021 23:30:41 +0800 Subject: [PATCH 315/336] blk-mq: don't insert FUA request with data into scheduler queue We never insert flush request into scheduler queue before. Recently commit d92ca9d8348f ("blk-mq: don't handle non-flush requests in blk_insert_flush") tries to handle FUA data request as normal request. This way has caused warning[1] in mq-deadline dd_exit_sched() or io hang in case of kyber since RQF_ELVPRIV isn't set for flush request, then ->finish_request won't be called. Fix the issue by inserting FUA data request with blk_mq_request_bypass_insert() when the device supports FUA, just like what we did before. [1] https://lore.kernel.org/linux-block/CAHj4cs-_vkTW=dAzbZYGxpEWSpzpcmaNeY1R=vH311+9vMUSdg@mail.gmail.com/ Reported-by: Yi Zhang Fixes: d92ca9d8348f ("blk-mq: don't handle non-flush requests in blk_insert_flush") Cc: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20211118153041.2163228-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-flush.c | 12 ++++++------ block/blk-mq.c | 4 +++- block/blk.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 8e364bda5166..1fce6d16e6d3 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -379,7 +379,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. */ -bool blk_insert_flush(struct request *rq) +void blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ @@ -409,7 +409,7 @@ bool blk_insert_flush(struct request *rq) */ if (!policy) { blk_mq_end_request(rq, 0); - return true; + return; } BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ @@ -420,8 +420,10 @@ bool blk_insert_flush(struct request *rq) * for normal execution. 
*/ if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) - return false; + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + blk_mq_request_bypass_insert(rq, false, true); + return; + } /* * @rq should go through flush machinery. Mark it part of flush @@ -437,8 +439,6 @@ bool blk_insert_flush(struct request *rq) spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); - - return true; } /** diff --git a/block/blk-mq.c b/block/blk-mq.c index eecbd7e6fea2..8799fa73ef34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2647,8 +2647,10 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) + if (op_is_flush(bio->bi_opf)) { + blk_insert_flush(rq); return; + } if (plug && (q->nr_hw_queues == 1 || blk_mq_is_shared_tags(rq->mq_hctx->flags) || diff --git a/block/blk.h b/block/blk.h index b4fed2033e48..ccde6e6f1736 100644 --- a/block/blk.h +++ b/block/blk.h @@ -271,7 +271,7 @@ void __blk_account_io_done(struct request *req, u64 now); */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -bool blk_insert_flush(struct request *rq); +void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); From df4e6faaafe2e4ff4dcdf6d5f5b1e2cb1fec63f7 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Fri, 19 Nov 2021 03:19:30 -0800 Subject: [PATCH 316/336] MAINTAINERS: Update for VMware PVRDMA driver Update maintainer info for the VMware PVRDMA driver. Link: https://lore.kernel.org/r/1637320770-44878-1-git-send-email-bryantan@vmware.com Reviewed-by: Adit Ranadive Reviewed-by: Vishnu Dasa Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7a2345ce8521..ecf376f8f82e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20317,7 +20317,8 @@ F: arch/x86/include/asm/vmware.h F: arch/x86/kernel/cpu/vmware.c VMWARE PVRDMA DRIVER -M: Adit Ranadive +M: Bryan Tan +M: Vishnu Dasa M: VMware PV-Drivers L: linux-rdma@vger.kernel.org S: Maintained From e349d945fac76bddc78ae1cb92a0145b427a87ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 11:11:13 -0600 Subject: [PATCH 317/336] signal: Don't always set SA_IMMUTABLE for forced signals Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Update force_sig_to_task to support both the case when we can allow the debugger to intercept and possibly ignore the signal and the case when it is not safe to let userspace know about the signal until the process has exited. Suggested-by: Linus Torvalds Reported-by: Kyle Huey Reported-by: kernel test robot Cc: stable@vger.kernel.org [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Link: https://lkml.kernel.org/r/877dd5qfw5.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. 
Biederman" --- kernel/signal.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 7c4b7ae714d4..7815e1bbeddc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1298,6 +1298,12 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p return ret; } +enum sig_handler { + HANDLER_CURRENT, /* If reachable use the current handler */ + HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ + HANDLER_EXIT, /* Only visible as the process exit code */ +}; + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1310,7 +1316,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * that is why we also clear SIGNAL_UNKILLABLE. */ static int -force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl) +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, + enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; @@ -1321,9 +1328,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); - if (blocked || ignored || sigdfl) { + if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; - action->sa.sa_flags |= SA_IMMUTABLE; + if (handler == HANDLER_EXIT) + action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) { sigdelset(&t->blocked, sig); recalc_sigpending_and_wake(t); @@ -1343,7 +1351,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, current, false); + return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* @@ -1660,7 +1668,7 @@ void force_fatal_sig(int sig) info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - force_sig_info_to_task(&info, current, true); + force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } /* @@ -1693,7 +1701,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info_to_task(&info, t, false); + return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr @@ -1813,7 +1821,8 @@ int force_sig_seccomp(int syscall, int reason, bool force_coredump) info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; - return force_sig_info_to_task(&info, current, force_coredump); + return force_sig_info_to_task(&info, current, + force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 14:23:21 -0600 Subject: [PATCH 318/336] signal: Replace force_fatal_sig with force_exit_sig when in doubt Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Add force_exit_sig and use it instead of force_fatal_sig where historically the code has directly called do_exit. 
This has the implementation benefits of going through the signal exit path (including generating core dumps) without the danger of allowing userspace to ignore or change these signals. This avoids userspace regressions as older kernels exited with do_exit which debuggers also can not intercept. In the future is should be possible to improve the quality of implementation of the kernel by changing some of these force_exit_sig calls to force_fatal_sig. That can be done where it matters on a case-by-case basis with careful analysis. Reported-by: Kyle Huey Reported-by: kernel test robot [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die") Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV") Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler") Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig") Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails") Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit") Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.") Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure") Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- arch/m68k/kernel/traps.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 4 ++-- arch/s390/kernel/traps.c | 2 +- arch/sparc/kernel/signal_32.c | 4 ++-- arch/sparc/kernel/windows.c | 2 +- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- include/linux/sched/signal.h | 1 + kernel/entry/syscall_user_dispatch.c | 4 ++-- kernel/signal.c | 13 +++++++++++++ 11 files changed, 26 insertions(+), 12 deletions(-) diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 99058a6da956..34d6458340b0 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -1145,7 +1145,7 @@ asmlinkage void set_esp0(unsigned long ssp) */ asmlinkage void fpsp040_die(void) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } #ifdef CONFIG_M68KFPU_EMU diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 00a9c9cd6d42..3e053e2fd6b6 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1063,7 +1063,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * We kill the task with a SIGSEGV in this situation. 
*/ if (do_setcontext(new_ctx, regs, 0)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index ef518535d436..d1e1fc0acbea 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -704,7 +704,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, */ if (__get_user_sigset(&set, &new_ctx->uc_sigmask)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } set_current_blocked(&set); @@ -713,7 +713,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, return -EFAULT; if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) { user_read_access_end(); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } user_read_access_end(); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 035705c9f23e..2b780786fc68 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -84,7 +84,7 @@ static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index cd677bc564a7..ffab16369bea 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -244,7 +244,7 @@ static int setup_frame(struct ksignal *ksig, struct pt_regs *regs, get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } @@ -336,7 +336,7 @@ static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs, sf = (struct rt_signal_frame __user *) get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } diff --git a/arch/sparc/kernel/windows.c b/arch/sparc/kernel/windows.c index bbbd40cc6b28..8f20862ccc83 100644 --- a/arch/sparc/kernel/windows.c +++ b/arch/sparc/kernel/windows.c @@ -122,7 +122,7 @@ void try_to_clear_window_buffer(struct pt_regs *regs, int who) if ((sp & 7) || copy_to_user((char __user *) sp, &tp->reg_window[window], sizeof(struct reg_window32))) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return; } } diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 0b6b277ee050..fd2ee9408e91 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -226,7 +226,7 @@ bool emulate_vsyscall(unsigned long error_code, if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index cce1c89cb7df..c21bcd668284 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -160,7 +160,7 @@ Efault_end: user_access_end(); Efault: pr_alert("could not access userspace vm86 info\n"); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); goto exit_vm86; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..33a50642cf41 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, 
int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 4508201847d2..0b6379adff6b 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -48,7 +48,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) * the selector is loaded by userspace. */ if (unlikely(__get_user(state, sd->selector))) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return true; } @@ -56,7 +56,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) return false; if (state != SYSCALL_DISPATCH_FILTER_BLOCK) { - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } } diff --git a/kernel/signal.c b/kernel/signal.c index 7815e1bbeddc..a629b11bf3e0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1671,6 +1671,19 @@ void force_fatal_sig(int sig) force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } +void force_exit_sig(int sig) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info_to_task(&info, current, HANDLER_EXIT); +} + /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused From 0dc636b3b757a6b747a156de613275f9d74a4a66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Nov 2021 10:29:47 +0100 Subject: [PATCH 319/336] x86: Pin task-stack in __get_wchan() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When commit 5d1ceb3969b6 ("x86: Fix __get_wchan() for !STACKTRACE") moved from stacktrace to native unwind_*() usage, the try_get_task_stack() got lost, leading to use-after-free issues for dying tasks. Signed-off-by: Peter Zijlstra (Intel) Fixes: 5d1ceb3969b6 ("x86: Fix __get_wchan() for !STACKTRACE") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215031 Link: https://lore.kernel.org/stable/YZV02RCRVHIa144u@fedora64.linuxtx.org/ Reported-by: Justin Forbes Reported-by: Holger Hoffstätte Cc: Qi Zheng Cc: Kees Cook Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- arch/x86/kernel/process.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e9ee8b526319..04143a653a8a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -964,6 +964,9 @@ unsigned long __get_wchan(struct task_struct *p) struct unwind_state state; unsigned long addr = 0; + if (!try_get_task_stack(p)) + return 0; + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); @@ -974,6 +977,8 @@ unsigned long __get_wchan(struct task_struct *p) break; } + put_task_stack(p); + return addr; } From 3cd018b4d6f2153e57bb67703891aa4fc7ac5c94 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 19 Nov 2021 16:43:15 -0800 Subject: [PATCH 320/336] mm/swap.c:put_pages_list(): reinitialise the page list While free_unref_page_list() puts pages onto the CPU local LRU list, it does not remove them from the list they were passed in on. 
That makes the list_head appear to be non-empty, and would lead to various corruption problems if we didn't have an assertion that the list was empty. Reinitialise the list after calling free_unref_page_list() to avoid this problem. Link: https://lkml.kernel.org/r/YYp40A2lNrxaZji8@casper.infradead.org Fixes: 988c69f1bc23 ("mm: optimise put_pages_list()") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Steve French Reported-by: Namjae Jeon Tested-by: Steve French Tested-by: Namjae Jeon Cc: Steve French Cc: Hyeoncheol Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/swap.c b/mm/swap.c index 1841c24682f8..e8c9dc6d0377 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -156,6 +156,7 @@ void put_pages_list(struct list_head *pages) } free_unref_page_list(pages); + INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); From 126e8bee943e9926238c891e2df5b5573aee76bc Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:18 -0800 Subject: [PATCH 321/336] ipc: WARN if trying to remove ipc object which is absent Patch series "shm: shm_rmid_forced feature fixes". Some time ago I hit a kernel crash after a CRIU restore procedure; fortunately, since it was a CRIU restore, I had dump files and could repeat the restore many times, and the crash reproduced easily. After some investigation I constructed a minimal reproducer. It turned out to be a use-after-free that happens only if sysctl kernel.shm_rmid_forced = 1. The key of the problem is that the exit_shm() function does not handle destruction of shp objects when task->sysvshm.shm_clist contains items from different IPC namespaces. In most cases this list will contain only items from one IPC namespace. How can this list contain objects from different namespaces? The exit_shm() function is designed to clean up this list whenever a process leaves its IPC namespace. But we made a mistake a long time ago and did not add an exit_shm() call to the setns() syscall procedures. The first idea was simply to add this call to the setns() syscall, but that obviously changes the semantics of setns() and is a userspace-visible change, so I gave up on this idea. The first real attempt to address the issue was simply to skip the forced destroy when we meet an shp object that does not belong to the current task's IPC namespace [1]. But that was not the best idea, because task->sysvshm.shm_clist was protected by an rwsem belonging to the current task's IPC namespace, which means list corruption may occur. The second approach is to extend exit_shm() to properly handle shp's from different IPC namespaces [2]. This is a really non-trivial thing; I put a lot of effort into it but did not believe it was possible to make it fully safe, clean and clear. Thanks to the efforts of Manfred Spraul, an elegant solution was designed. Thanks a lot, Manfred! Eric also suggested a way to address the issue in ("[RFC][PATCH] shm: In shm_exit destroy all created and never attached segments"). Eric's idea was to maintain a list of shm_clists, one per IPC namespace, using lock-less lists. But there are some extra memory consumption concerns with that approach. An alternative solution, which I suggested, was implemented in ("shm: reset shm_clist on setns but omit forced shm destroy"). The idea is pretty simple: we add an exit_shm() call to setns() but DO NOT destroy shm segments even if sysctl kernel.shm_rmid_forced = 1; we just clean up the task->sysvshm.shm_clist list.
This changes the semantics of the setns() syscall a little bit, but in comparison to the "naive" solution of just adding exit_shm() without any special exclusions, it looks like the safer option. [1] https://lkml.org/lkml/2021/7/6/1108 [2] https://lkml.org/lkml/2021/7/14/736 This patch (of 2): Let's produce a warning if we try to remove a non-existing IPC object from the IPC namespace kht/idr structures. This allows us to catch possible bugs where the ipc_rmid() function is called with inconsistent struct ipc_ids*, struct kern_ipc_perm* arguments. Link: https://lkml.kernel.org/r/20211027224348.611025-1-alexander.mikhalitsyn@virtuozzo.com Link: https://lkml.kernel.org/r/20211027224348.611025-2-alexander.mikhalitsyn@virtuozzo.com Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index d48d8cfa1f3f..fa2d86ef3fb8 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -447,8 +447,8 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) - rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, - ipc_kht_params); + WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params)); } /** @@ -498,7 +498,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, idx); + WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 19 Nov 2021 16:43:21 -0800 Subject: [PATCH 322/336] shm: extend forced shm destroy to support objects from several IPC nses Currently, the exit_shm() function is not designed to work properly when task->sysvshm.shm_clist holds shm objects from different IPC namespaces. This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it leads to a use-after-free (a reproducer exists). This is an attempt to fix the problem by extending the exit_shm mechanism to handle shm destruction across several IPC namespaces. To achieve that we do several things: 1. add a namespace (non-refcounted) pointer to struct shmid_kernel 2. during new shm object creation (newseg()/shmget syscall), initialize this pointer from the current task's IPC ns 3. rework exit_shm() so that it traverses all shp's in task->sysvshm.shm_clist and gets the IPC namespace not from the current task, as it did before, but from the shp object itself, then calls shm_destroy(shp, ns). Note: We need to be really careful here, because, as said before (1), our pointer to the IPC ns is non-refcounted. To be on the safe side we use the special helper get_ipc_ns_not_zero(), which takes an IPC ns reference only if the IPC ns is not already in the "state of destruction". Q/A Q: Why can we access shp->ns memory using a non-refcounted pointer? A: Because the shp object lifetime is always shorter than the IPC namespace lifetime, so, if we get the shp object from task->sysvshm.shm_clist while holding task_lock(task), nobody can steal our namespace. Q: Does this patch change the semantics of the unshare/setns/clone syscalls? A: No.
It's just fixes non-covered case when process may leave IPC namespace without getting task->sysvshm.shm_clist list cleaned up. Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") Co-developed-by: Manfred Spraul Signed-off-by: Manfred Spraul Signed-off-by: Alexander Mikhalitsyn Cc: "Eric W. Biederman" Cc: Davidlohr Bueso Cc: Greg KH Cc: Andrei Vagin Cc: Pavel Tikhomirov Cc: Vasily Averin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 15 +++ include/linux/sched/task.h | 2 +- ipc/shm.c | 197 +++++++++++++++++++++++++--------- 3 files changed, 163 insertions(+), 51 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 05e22770af51..b75395ec8d52 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -131,6 +131,16 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + if (ns) { + if (refcount_inc_not_zero(&ns->ns.count)) + return ns; + } + + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -147,6 +157,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ba88a6987400..058d7f371e25 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -158,7 +158,7 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. And ->vfork_done. + * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/ipc/shm.c b/ipc/shm.c index 4942bdd65748..b3048ebd5c31 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -62,9 +62,18 @@ struct shmid_kernel /* private to the kernel */ struct pid *shm_lprid; struct ucounts *mlock_ucounts; - /* The task created the shm object. NULL if the task is dead. */ + /* + * The task created the shm object, for + * task_lock(shp->shm_creator) + */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* + * List by creator. task_lock(->shm_creator) required for read/write. + * If list_empty(), then the creator is dead already. 
+ */ + struct list_head shm_clist; + struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ @@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); + WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; @@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head *head) kfree(shp); } -static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) +/* + * It has to be called with shp locked. + * It must be called before ipc_rmid() + */ +static inline void shm_clist_rm(struct shmid_kernel *shp) { - list_del(&s->shm_clist); - ipc_rmid(&shm_ids(ns), &s->shm_perm); + struct task_struct *creator; + + /* ensure that shm_creator does not disappear */ + rcu_read_lock(); + + /* + * A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + /* + * shp->shm_creator is guaranteed to be valid *only* + * if shp->shm_clist is not empty. + */ + creator = shp->shm_creator; + + task_lock(creator); + /* + * list_del_init() is a nop if the entry was already removed + * from the list. + */ + list_del_init(&shp->shm_clist); + task_unlock(creator); + } + rcu_read_unlock(); +} + +static inline void shm_rmid(struct shmid_kernel *s) +{ + shm_clist_rm(s); + ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } @@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp); + shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) shmem_lock(shm_file, 0, shp->mlock_ucounts); @@ -303,10 +346,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && - (ns->shm_rmid_forced || + (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } @@ -337,7 +380,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -358,10 +401,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * * As shp->* are changed under rwsem, it's safe to skip shp locking. */ - if (shp->shm_creator != NULL) + if (!list_empty(&shp->shm_clist)) return 0; - if (shm_may_destroy(ns, shp)) { + if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } @@ -379,48 +422,97 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - struct ipc_namespace *ns = task->nsproxy->ipc_ns; - struct shmid_kernel *shp, *n; + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns; - if (list_empty(&task->sysvshm.shm_clist)) - return; + task_lock(task); - /* - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. 
- */ - if (!ns->shm_rmid_forced) { - down_read(&shm_ids(ns).rwsem); - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) - shp->shm_creator = NULL; - /* - * Only under read lock but we are only called on current - * so no entry on the list will be shared. - */ - list_del(&task->sysvshm.shm_clist); - up_read(&shm_ids(ns).rwsem); - return; - } - - /* - * Destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. - */ - down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - shp->shm_creator = NULL; - - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); + if (list_empty(&task->sysvshm.shm_clist)) { + task_unlock(task); + break; } - } - /* Remove the list head from any segments still attached. */ - list_del(&task->sysvshm.shm_clist); - up_write(&shm_ids(ns).rwsem); + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist); + + /* + * 1) Get pointer to the ipc namespace. It is worth to say + * that this pointer is guaranteed to be valid because + * shp lifetime is always shorter than namespace lifetime + * in which shp lives. + * We taken task_lock it means that shp won't be freed. + */ + ns = shp->ns; + + /* + * 2) If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) + goto unlink_continue; + + /* + * 3) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). + */ + ns = get_ipc_ns_not_zero(ns); + if (!ns) { +unlink_continue: + list_del_init(&shp->shm_clist); + task_unlock(task); + continue; + } + + /* + * 4) get a reference to shp. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. + */ + WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); + + /* + * 5) unlink the shm segment from the list of segments + * created by current. + * This must be done last. After unlinking, + * only the refcounts obtained above prevent IPC_RMID + * from destroying the segment or the namespace. + */ + list_del_init(&shp->shm_clist); + + task_unlock(task); + + /* + * 6) we have all references + * Thus lock & if needed destroy shp. + */ + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + /* + * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's + * safe to call ipc_rcu_putref here + */ + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp from namespace + * idr/kht while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ + } } static vm_fault_t shm_fault(struct vm_fault *vmf) @@ -676,7 +768,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id; + shp->ns = ns; + + task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. 
@@ -1567,7 +1663,8 @@ out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); From 9a543f007b702b0be4acacad416a0f90233b4558 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Fri, 19 Nov 2021 16:43:25 -0800 Subject: [PATCH 323/336] mm: emit the "free" trace report before freeing memory in kmem_cache_free() After the memory is freed, it can be immediately allocated by other CPUs, before the "free" trace report has been emitted. This causes inaccurate traces. For example, if the following sequence of events occurs: CPU 0 CPU 1 (1) alloc xxxxxx (2) free xxxxxx (3) alloc xxxxxx (4) free xxxxxx Then they will be inaccurately reported via tracing, so that they appear to have happened in this order: CPU 0 CPU 1 (1) alloc xxxxxx (2) alloc xxxxxx (3) free xxxxxx (4) free xxxxxx This makes it look like CPU 1 somehow managed to allocate memory that CPU 0 still had allocated for itself. In order to avoid this, emit the "free xxxxxx" tracing report just before the actual call to free the memory, instead of just after it. Link: https://lkml.kernel.org/r/374eb75d-7404-8721-4e1e-65b0e5b17279@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Vlastimil Babka Reviewed-by: John Hubbard Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- mm/slob.c | 3 +-- mm/slub.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index da132a9ae6f8..ca4822f6b2b6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3733,14 +3733,13 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) if (!cachep) return; + trace_kmem_cache_free(_RET_IP_, objp, cachep->name); local_irq_save(flags); debug_check_no_locks_freed(objp, cachep->object_size); if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(objp, cachep->object_size); __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); - - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 74d3f6e60666..03deee1e6a94 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -666,6 +666,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); + trace_kmem_cache_free(_RET_IP_, b, c->name); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); @@ -674,8 +675,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } - - trace_kmem_cache_free(_RET_IP_, b, c->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index f7368bfffb7a..a8626825a829 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3526,8 +3526,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x, s->name); + slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); From ffb92ce826fd801acb0f4e15b75e4ddf0d189bde Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 19 Nov 2021 16:43:28 -0800 Subject: [PATCH 324/336] hexagon: export raw I/O routines for modules Patch series "Fixes for ARCH=hexagon allmodconfig", v2. 
This series fixes some issues noticed with ARCH=hexagon allmodconfig. This patch (of 3): When building ARCH=hexagon allmodconfig, the following errors occur: ERROR: modpost: "__raw_readsl" [drivers/i3c/master/svc-i3c-master.ko] undefined! ERROR: modpost: "__raw_writesl" [drivers/i3c/master/dw-i3c-master.ko] undefined! ERROR: modpost: "__raw_readsl" [drivers/i3c/master/dw-i3c-master.ko] undefined! ERROR: modpost: "__raw_writesl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! ERROR: modpost: "__raw_readsl" [drivers/i3c/master/i3c-master-cdns.ko] undefined! Export these symbols so that modules can use them without any errors. Link: https://lkml.kernel.org/r/20211115174250.1994179-1-nathan@kernel.org Link: https://lkml.kernel.org/r/20211115174250.1994179-2-nathan@kernel.org Fixes: 013bf24c3829 ("Hexagon: Provide basic implementation and/or stubs for I/O routines.") Signed-off-by: Nathan Chancellor Acked-by: Brian Cain Cc: Nick Desaulniers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/lib/io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/hexagon/lib/io.c b/arch/hexagon/lib/io.c index d35d69d6588c..55f75392857b 100644 --- a/arch/hexagon/lib/io.c +++ b/arch/hexagon/lib/io.c @@ -27,6 +27,7 @@ void __raw_readsw(const void __iomem *addr, void *data, int len) *dst++ = *src; } +EXPORT_SYMBOL(__raw_readsw); /* * __raw_writesw - read words a short at a time @@ -47,6 +48,7 @@ void __raw_writesw(void __iomem *addr, const void *data, int len) } +EXPORT_SYMBOL(__raw_writesw); /* Pretty sure len is pre-adjusted for the length of the access already */ void __raw_readsl(const void __iomem *addr, void *data, int len) @@ -62,6 +64,7 @@ void __raw_readsl(const void __iomem *addr, void *data, int len) } +EXPORT_SYMBOL(__raw_readsl); void __raw_writesl(void __iomem *addr, const void *data, int len) { @@ -76,3 +79,4 @@ void __raw_writesl(void __iomem *addr, const void *data, int len) } +EXPORT_SYMBOL(__raw_writesl); From 51f2ec593441d3d1ebc0d478fac3ea329c7c93ac Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 19 Nov 2021 16:43:31 -0800 Subject: [PATCH 325/336] hexagon: clean up timer-regs.h When building allmodconfig, there is a warning about TIMER_ENABLE being redefined: drivers/clocksource/timer-oxnas-rps.c:39:9: error: 'TIMER_ENABLE' macro redefined [-Werror,-Wmacro-redefined] #define TIMER_ENABLE BIT(7) ^ arch/hexagon/include/asm/timer-regs.h:13:9: note: previous definition is here #define TIMER_ENABLE 0 ^ 1 error generated. The values in this header are only used in one file each, if they are used at all. Remove the header and sink all of the constants into their respective files. TCX0_CLK_RATE is only used in arch/hexagon/include/asm/timex.h TIMER_ENABLE, RTOS_TIMER_INT, RTOS_TIMER_REGS_ADDR are only used in arch/hexagon/kernel/time.c. SLEEP_CLK_RATE and TIMER_CLR_ON_MATCH have both been unused since the file's introduction in commit 71e4a47f32f4 ("Hexagon: Add time and timer functions"). TIMER_ENABLE is redefined as BIT(0) so the shift is moved into the definition, rather than its use. 
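As a quick aside (an illustration added here, not part of the patch): the redefinition does not change behaviour because the old header exported the bit position while the new definition is the bit mask itself, so the shift simply moves from the call site into the macro. A minimal, stand-alone sketch with hypothetical OLD_/NEW_ names:

#include <stdio.h>

#define BIT(n)			(1UL << (n))	/* as provided by <linux/bits.h> */
#define OLD_TIMER_ENABLE	0		/* old header: bit position */
#define NEW_TIMER_ENABLE	BIT(0)		/* new definition: bit mask */

int main(void)
{
	/* the old call site wrote (1 << OLD_TIMER_ENABLE); the new one writes the mask directly */
	printf("old: %#lx new: %#lx\n", 1UL << OLD_TIMER_ENABLE, NEW_TIMER_ENABLE);
	return 0;	/* both evaluate to 0x1, so the enable-register write is unchanged */
}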
Link: https://lkml.kernel.org/r/20211115174250.1994179-3-nathan@kernel.org Signed-off-by: Nathan Chancellor Acked-by: Brian Cain Cc: Nick Desaulniers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/timer-regs.h | 26 -------------------------- arch/hexagon/include/asm/timex.h | 3 +-- arch/hexagon/kernel/time.c | 12 ++++++++++-- 3 files changed, 11 insertions(+), 30 deletions(-) delete mode 100644 arch/hexagon/include/asm/timer-regs.h diff --git a/arch/hexagon/include/asm/timer-regs.h b/arch/hexagon/include/asm/timer-regs.h deleted file mode 100644 index ee6c61423a05..000000000000 --- a/arch/hexagon/include/asm/timer-regs.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Timer support for Hexagon - * - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. - */ - -#ifndef _ASM_TIMER_REGS_H -#define _ASM_TIMER_REGS_H - -/* This stuff should go into a platform specific file */ -#define TCX0_CLK_RATE 19200 -#define TIMER_ENABLE 0 -#define TIMER_CLR_ON_MATCH 1 - -/* - * 8x50 HDD Specs 5-8. Simulator co-sim not fixed until - * release 1.1, and then it's "adjustable" and probably not defaulted. - */ -#define RTOS_TIMER_INT 3 -#ifdef CONFIG_HEXAGON_COMET -#define RTOS_TIMER_REGS_ADDR 0xAB000000UL -#endif -#define SLEEP_CLK_RATE 32000 - -#endif diff --git a/arch/hexagon/include/asm/timex.h b/arch/hexagon/include/asm/timex.h index 8d4ec76fceb4..dfe69e118b2b 100644 --- a/arch/hexagon/include/asm/timex.h +++ b/arch/hexagon/include/asm/timex.h @@ -7,11 +7,10 @@ #define _ASM_TIMEX_H #include -#include #include /* Using TCX0 as our clock. CLOCK_TICK_RATE scheduled to be removed. */ -#define CLOCK_TICK_RATE TCX0_CLK_RATE +#define CLOCK_TICK_RATE 19200 #define ARCH_HAS_READ_CURRENT_TIMER diff --git a/arch/hexagon/kernel/time.c b/arch/hexagon/kernel/time.c index feffe527ac92..febc95714d75 100644 --- a/arch/hexagon/kernel/time.c +++ b/arch/hexagon/kernel/time.c @@ -17,9 +17,10 @@ #include #include -#include #include +#define TIMER_ENABLE BIT(0) + /* * For the clocksource we need: * pcycle frequency (600MHz) @@ -33,6 +34,13 @@ cycles_t pcycle_freq_mhz; cycles_t thread_freq_mhz; cycles_t sleep_clk_freq; +/* + * 8x50 HDD Specs 5-8. Simulator co-sim not fixed until + * release 1.1, and then it's "adjustable" and probably not defaulted. + */ +#define RTOS_TIMER_INT 3 +#define RTOS_TIMER_REGS_ADDR 0xAB000000UL + static struct resource rtos_timer_resources[] = { { .start = RTOS_TIMER_REGS_ADDR, @@ -80,7 +88,7 @@ static int set_next_event(unsigned long delta, struct clock_event_device *evt) iowrite32(0, &rtos_timer->clear); iowrite32(delta, &rtos_timer->match); - iowrite32(1 << TIMER_ENABLE, &rtos_timer->enable); + iowrite32(TIMER_ENABLE, &rtos_timer->enable); return 0; } From eaac2f898974234b38db72aed573e68fa5a81f7e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 19 Nov 2021 16:43:34 -0800 Subject: [PATCH 326/336] hexagon: ignore vmlinux.lds After building allmodconfig, there is an untracked vmlinux.lds file in arch/hexagon/kernel: $ git ls-files . --exclude-standard --others arch/hexagon/kernel/vmlinux.lds Ignore it as all other architectures have. 
Link: https://lkml.kernel.org/r/20211115174250.1994179-4-nathan@kernel.org Signed-off-by: Nathan Chancellor Cc: Brian Cain Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/kernel/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/hexagon/kernel/.gitignore diff --git a/arch/hexagon/kernel/.gitignore b/arch/hexagon/kernel/.gitignore new file mode 100644 index 000000000000..c5f676c3c224 --- /dev/null +++ b/arch/hexagon/kernel/.gitignore @@ -0,0 +1 @@ +vmlinux.lds From 34dbc3aaf5d9e89ba6cc5e24add9458c21ab1950 Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Fri, 19 Nov 2021 16:43:37 -0800 Subject: [PATCH 327/336] mm: kmemleak: slob: respect SLAB_NOLEAKTRACE flag When kmemleak is enabled for SLOB, system does not boot and does not print anything to the console. At the very early stage in the boot process we hit infinite recursion from kmemleak_init() and eventually kernel crashes. kmemleak_init() specifies SLAB_NOLEAKTRACE for KMEM_CACHE(), but kmem_cache_create_usercopy() removes it because CACHE_CREATE_MASK is not valid for SLOB. Let's fix CACHE_CREATE_MASK and make kmemleak work with SLOB Link: https://lkml.kernel.org/r/20211115020850.3154366-1-rkovhaev@gmail.com Fixes: d8843922fba4 ("slab: Ignore internal flags in cache creation") Signed-off-by: Rustam Kovhaev Acked-by: Vlastimil Babka Reviewed-by: Muchun Song Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Catalin Marinas Cc: Greg Kroah-Hartman Cc: Glauber Costa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 58c01a34e5b8..56ad7eea3ddf 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -147,7 +147,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT) #else -#define SLAB_CACHE_FLAGS (0) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif /* Common flags available with current configuration */ From afe041c2d0febd83698b8b0164e6b3b1dfae0b66 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 19 Nov 2021 16:43:40 -0800 Subject: [PATCH 328/336] hugetlb: fix hugetlb cgroup refcounting during mremap When hugetlb_vm_op_open() is called during copy_vma(), we may take the reference to resv_map->css. Later, when clearing the reservation pointer of old_vma after transferring it to new_vma, we forget to drop the reference to resv_map->css. This leads to a reference leak of css. 
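To make the imbalance concrete (a toy user-space model only, with hypothetical names; the actual fix is in the diff below): the copy path takes a reference on the reservation's css, so the path that detaches the old VMA's reservation must drop one, otherwise the count only ever grows.

#include <assert.h>

struct counted { int refs; };	/* stands in for resv_map->css */

int main(void)
{
	struct counted css = { .refs = 1 };

	css.refs++;		/* hugetlb_vm_op_open() during copy_vma(): css_get() */
	/* old behaviour: old_vma's reservation pointer cleared with no css_put(), refs stays at 2 */
	css.refs--;		/* behaviour after the fix: the matching css_put() */
	assert(css.refs == 1);	/* reference count balanced again */
	return 0;
}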
Fixes this by adding a check to drop reservation css reference in clear_vma_resv_huge_pages() Link: https://lkml.kernel.org/r/20211113154412.91134-1-minhquangbui99@gmail.com Fixes: 550a7d60bd5e35 ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Bui Quang Minh Reviewed-by: Mike Kravetz Reviewed-by: Mina Almasry Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb_cgroup.h | 12 ++++++++++++ mm/hugetlb.c | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c137396129db..ba025ae27882 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -128,6 +128,13 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( css_get(resv_map->css); } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ + if (resv_map->css) + css_put(resv_map->css); +} + extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, @@ -211,6 +218,11 @@ static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( { } +static inline void resv_map_put_hugetlb_cgroup_uncharge_info( + struct resv_map *resv_map) +{ +} + static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e09159c957e3..3a2479003ddf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1037,8 +1037,10 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) */ struct resv_map *reservations = vma_resv_map(vma); - if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_put_hugetlb_cgroup_uncharge_info(reservations); kref_put(&reservations->refs, resv_map_release); + } reset_vma_resv_huge_pages(vma); } From cc30042df6fcc82ea18acf0dace831503e60a0b7 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 19 Nov 2021 16:43:43 -0800 Subject: [PATCH 329/336] hugetlb, userfaultfd: fix reservation restore on userfaultfd error Currently in the is_continue case in hugetlb_mcopy_atomic_pte(), if we bail out using "goto out_release_unlock;" in the cases where idx >= size, or !huge_pte_none(), the code will detect that new_pagecache_page == false, and so call restore_reserve_on_error(). In this case I see restore_reserve_on_error() delete the reservation, and the following call to remove_inode_hugepages() will increment h->resv_hugepages causing a 100% reproducible leak. We should treat the is_continue case similar to adding a page into the pagecache and set new_pagecache_page to true, to indicate that there is no reservation to restore on the error path, and we need not call restore_reserve_on_error(). Rename new_pagecache_page to page_in_pagecache to make that clear. 
Link: https://lkml.kernel.org/r/20211117193825.378528-1-almasrymina@google.com Fixes: c7b1850dfb41 ("hugetlb: don't pass page cache pages to restore_reserve_on_error") Signed-off-by: Mina Almasry Reported-by: James Houghton Reviewed-by: Mike Kravetz Cc: Wei Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3a2479003ddf..f025d234522f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5736,13 +5736,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret = -ENOMEM; struct page *page; int writable; - bool new_pagecache_page = false; + bool page_in_pagecache = false; if (is_continue) { ret = -EFAULT; page = find_lock_page(mapping, idx); if (!page) goto out; + page_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -5830,7 +5831,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ret = huge_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; - new_pagecache_page = true; + page_in_pagecache = true; } ptl = huge_pte_lockptr(h, dst_mm, dst_pte); @@ -5894,7 +5895,7 @@ out_release_unlock: if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: - if (!new_pagecache_page) + if (!page_in_pagecache) restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; From cab71f7495f7aa639ca4b8508f4c3e426e9cb2f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Nov 2021 16:43:46 -0800 Subject: [PATCH 330/336] kasan: test: silence intentional read overflow warnings As done in commit d73dad4eb5ad ("kasan: test: bypass __alloc_size checks") for __write_overflow warnings, also silence some more cases that trip the __read_overflow warnings seen in 5.16-rc1[1]: In file included from include/linux/string.h:253, from include/linux/bitmap.h:10, from include/linux/cpumask.h:12, from include/linux/mm_types_task.h:14, from include/linux/mm_types.h:5, from include/linux/page-flags.h:13, from arch/arm64/include/asm/mte.h:14, from arch/arm64/include/asm/pgtable.h:12, from include/linux/pgtable.h:6, from include/linux/kasan.h:29, from lib/test_kasan.c:10: In function 'memcmp', inlined from 'kasan_memcmp' at lib/test_kasan.c:897:2: include/linux/fortify-string.h:263:25: error: call to '__read_overflow' declared with attribute error: detected read beyond size of object (1st parameter) 263 | __read_overflow(); | ^~~~~~~~~~~~~~~~~ In function 'memchr', inlined from 'kasan_memchr' at lib/test_kasan.c:872:2: include/linux/fortify-string.h:277:17: error: call to '__read_overflow' declared with attribute error: detected read beyond size of object (1st parameter) 277 | __read_overflow(); | ^~~~~~~~~~~~~~~~~ [1] http://kisskb.ellerman.id.au/kisskb/buildresult/14660585/log/ Link: https://lkml.kernel.org/r/20211116004111.3171781-1-keescook@chromium.org Fixes: d73dad4eb5ad ("kasan: test: bypass __alloc_size checks") Signed-off-by: Kees Cook Reviewed-by: Andrey Konovalov Acked-by: Marco Elver Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 67ed689a0b1b..0643573f8686 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -869,6 +869,7 @@ static void kasan_memchr(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); 
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); @@ -894,6 +895,7 @@ static void kasan_memcmp(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); + OPTIMIZER_HIDE_VAR(size); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = memcmp(ptr, arr, size+1)); kfree(ptr); From db7a347b26fe05d2e8c115bb24dfd908d0252bc3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 19 Nov 2021 16:43:49 -0800 Subject: [PATCH 331/336] mm/damon/dbgfs: use '__GFP_NOWARN' for user-specified size buffer allocation Patch series "DAMON fixes". This patch (of 2): DAMON users can trigger below warning in '__alloc_pages()' by invoking write() to some DAMON debugfs files with arbitrarily high count argument, because DAMON debugfs interface allocates some buffers based on the user-specified 'count'. if (unlikely(order >= MAX_ORDER)) { WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } Because the DAMON debugfs interface code checks failure of the 'kmalloc()', this commit simply suppresses the warnings by adding '__GFP_NOWARN' flag. Link: https://lkml.kernel.org/r/20211110145758.16558-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211110145758.16558-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index eccc14b34901..8ce1311ac533 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -32,7 +32,7 @@ static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) if (*ppos) return ERR_PTR(-EINVAL); - kbuf = kmalloc(count + 1, GFP_KERNEL); + kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return ERR_PTR(-ENOMEM); @@ -133,7 +133,7 @@ static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; @@ -452,7 +452,7 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; @@ -578,7 +578,7 @@ static ssize_t dbgfs_kdamond_pid_read(struct file *file, char *kbuf; ssize_t len; - kbuf = kmalloc(count, GFP_KERNEL); + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); if (!kbuf) return -ENOMEM; From d78f3853f831eee46c6dbe726debf3be9e9c0d05 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 19 Nov 2021 16:43:52 -0800 Subject: [PATCH 332/336] mm/damon/dbgfs: fix missed use of damon_dbgfs_lock DAMON debugfs is supposed to protect dbgfs_ctxs, dbgfs_nr_ctxs, and dbgfs_dirs using damon_dbgfs_lock. However, some of the code is accessing the variables without the protection. This fixes it by protecting all such accesses. 
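The rule being applied is the standard one for module-global state (sketched below as a toy user-space example with hypothetical names, not the DAMON code itself; the real hunks follow): every reader and writer of the shared pointers takes the same lock, and every exit path, including the early error returns, releases it.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;	/* like damon_dbgfs_lock */
static void **ctxs;	/* shared state, like dbgfs_ctxs */
static int nr_ctxs;	/* shared state, like dbgfs_nr_ctxs */

static int ctxs_init(void)
{
	int rc = -1;

	pthread_mutex_lock(&ctx_lock);
	ctxs = malloc(sizeof(*ctxs));
	if (!ctxs)
		goto out;	/* the error path still drops the lock below */
	nr_ctxs = 1;
	rc = 0;
out:
	pthread_mutex_unlock(&ctx_lock);
	return rc;
}

int main(void)
{
	return ctxs_init() ? 1 : 0;
}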
Link: https://lkml.kernel.org/r/20211110145758.16558-3-sj@kernel.org Fixes: 75c1c2b53c78 ("mm/damon/dbgfs: support multiple contexts") Signed-off-by: SeongJae Park Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 8ce1311ac533..9b520bb4a3e7 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -877,12 +877,14 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, return -EINVAL; } + mutex_lock(&damon_dbgfs_lock); if (!strncmp(kbuf, "on", count)) { int i; for (i = 0; i < dbgfs_nr_ctxs; i++) { if (damon_targets_empty(dbgfs_ctxs[i])) { kfree(kbuf); + mutex_unlock(&damon_dbgfs_lock); return -EINVAL; } } @@ -892,6 +894,7 @@ static ssize_t dbgfs_monitor_on_write(struct file *file, } else { ret = -EINVAL; } + mutex_unlock(&damon_dbgfs_lock); if (!ret) ret = count; @@ -944,15 +947,16 @@ static int __init __damon_dbgfs_init(void) static int __init damon_dbgfs_init(void) { - int rc; + int rc = -ENOMEM; + mutex_lock(&damon_dbgfs_lock); dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); if (!dbgfs_ctxs) - return -ENOMEM; + goto out; dbgfs_ctxs[0] = dbgfs_new_ctx(); if (!dbgfs_ctxs[0]) { kfree(dbgfs_ctxs); - return -ENOMEM; + goto out; } dbgfs_nr_ctxs = 1; @@ -963,6 +967,8 @@ static int __init damon_dbgfs_init(void) pr_err("%s: dbgfs init failed\n", __func__); } +out: + mutex_unlock(&damon_dbgfs_lock); return rc; } From 825c43f50e3aa811a291ffcb40e02fbf6d91ba86 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 19 Nov 2021 16:43:55 -0800 Subject: [PATCH 333/336] kmap_local: don't assume kmap PTEs are linear arrays in memory The kmap_local conversion broke the ARM architecture, because the new code assumes that all PTEs used for creating kmaps form a linear array in memory, and uses array indexing to look up the kmap PTE belonging to a certain kmap index. On ARM, this cannot work, not only because the PTE pages may be non-adjacent in memory, but also because ARM/!LPAE interleaves hardware entries and extended entries (carrying software-only bits) in a way that is not compatible with array indexing. Fortunately, this only seems to affect configurations with more than 8 CPUs, due to the way the per-CPU kmap slots are organized in memory. Work around this by permitting an architecture to set a Kconfig symbol that signifies that the kmap PTEs do not form a linear array in memory, and so the only way to locate the appropriate one is to walk the page tables.
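To illustrate the distinction (a stand-alone toy, not the kernel code; the real lookup helper is kmap_get_pte() in the diff below): computing a PTE address with pointer arithmetic from one cached pointer is only valid when all the kmap PTEs sit in one contiguous array, so architectures where they do not must resolve each entry individually.

#include <stdio.h>

#define NR_SLOTS 4

/* contiguous case: indexing from one base pointer is fine within one array */
static unsigned long linear_table[NR_SLOTS];

/* non-contiguous case: each slot may live in a separately allocated table */
static unsigned long slot0, slot1, slot2, slot3;
static unsigned long *sparse_table[NR_SLOTS] = { &slot0, &slot1, &slot2, &slot3 };

static unsigned long *lookup(int idx, int linear)
{
	if (linear)
		return &linear_table[idx];	/* analogous to &__kmap_pte[-idx] */
	return sparse_table[idx];		/* analogous to walking the page tables */
}

int main(void)
{
	printf("linear: %p sparse: %p\n", (void *)lookup(2, 1), (void *)lookup(2, 0));
	return 0;
}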
Link: https://lore.kernel.org/linux-arm-kernel/20211026131249.3731275-1-ardb@kernel.org/ Link: https://lkml.kernel.org/r/20211116094737.7391-1-ardb@kernel.org Fixes: 2a15ba82fa6c ("ARM: highmem: Switch to generic kmap atomic") Signed-off-by: Ard Biesheuvel Reported-by: Quanyang Wang Reviewed-by: Linus Walleij Acked-by: Russell King (Oracle) Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + mm/Kconfig | 3 +++ mm/highmem.c | 32 +++++++++++++++++++++----------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f0f9e8bec83a..c2724d986fa0 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1463,6 +1463,7 @@ config HIGHMEM bool "High Memory Support" depends on MMU select KMAP_LOCAL + select KMAP_LOCAL_NON_LINEAR_PTE_ARRAY help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/mm/Kconfig b/mm/Kconfig index 068ce591a13a..28edafc820ad 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -890,6 +890,9 @@ config MAPPING_DIRTY_HELPERS config KMAP_LOCAL bool +config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY + bool + # struct io_mapping based helper. Selected by drivers that need them config IO_MAPPING bool diff --git a/mm/highmem.c b/mm/highmem.c index 88f65f155845..ca9fa8c92593 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -503,16 +503,22 @@ static inline int kmap_local_calc_idx(int idx) static pte_t *__kmap_pte; -static pte_t *kmap_get_pte(void) +static pte_t *kmap_get_pte(unsigned long vaddr, int idx) { + if (IS_ENABLED(CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY)) + /* + * Set by the arch if __kmap_pte[-idx] does not produce + * the correct entry. + */ + return virt_to_kpte(vaddr); if (!__kmap_pte) __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); - return __kmap_pte; + return &__kmap_pte[-idx]; } void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { - pte_t pteval, *kmap_pte = kmap_get_pte(); + pte_t pteval, *kmap_pte; unsigned long vaddr; int idx; @@ -524,9 +530,10 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) preempt_disable(); idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte - idx))); + kmap_pte = kmap_get_pte(vaddr, idx); + BUG_ON(!pte_none(*kmap_pte)); pteval = pfn_pte(pfn, prot); - arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); arch_kmap_local_post_map(vaddr, pteval); current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); @@ -559,7 +566,7 @@ EXPORT_SYMBOL(__kmap_local_page_prot); void kunmap_local_indexed(void *vaddr) { unsigned long addr = (unsigned long) vaddr & PAGE_MASK; - pte_t *kmap_pte = kmap_get_pte(); + pte_t *kmap_pte; int idx; if (addr < __fix_to_virt(FIX_KMAP_END) || @@ -584,8 +591,9 @@ void kunmap_local_indexed(void *vaddr) idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + kmap_pte = kmap_get_pte(addr, idx); arch_kmap_local_pre_unmap(addr); - pte_clear(&init_mm, addr, kmap_pte - idx); + pte_clear(&init_mm, addr, kmap_pte); arch_kmap_local_post_unmap(addr); current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); @@ -607,7 +615,7 @@ EXPORT_SYMBOL(kunmap_local_indexed); void __kmap_local_sched_out(void) { struct task_struct *tsk = current; - pte_t *kmap_pte = kmap_get_pte(); + pte_t *kmap_pte; 
int i; /* Clear kmaps */ @@ -634,8 +642,9 @@ void __kmap_local_sched_out(void) idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + kmap_pte = kmap_get_pte(addr, idx); arch_kmap_local_pre_unmap(addr); - pte_clear(&init_mm, addr, kmap_pte - idx); + pte_clear(&init_mm, addr, kmap_pte); arch_kmap_local_post_unmap(addr); } } @@ -643,7 +652,7 @@ void __kmap_local_sched_out(void) void __kmap_local_sched_in(void) { struct task_struct *tsk = current; - pte_t *kmap_pte = kmap_get_pte(); + pte_t *kmap_pte; int i; /* Restore kmaps */ @@ -663,7 +672,8 @@ void __kmap_local_sched_in(void) /* See comment in __kmap_local_sched_out() */ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + kmap_pte = kmap_get_pte(addr, idx); + set_pte_at(&init_mm, addr, kmap_pte, pteval); arch_kmap_local_post_map(addr, pteval); } } From c1e63117711977cc4295b2ce73de29dd17066c82 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 19 Nov 2021 16:43:58 -0800 Subject: [PATCH 334/336] proc/vmcore: fix clearing user buffer by properly using clear_user() To clear a user buffer we cannot simply use memset, we have to use clear_user(). With a virtio-mem device that registers a vmcore_cb and has some logically unplugged memory inside an added Linux memory block, I can easily trigger a BUG by copying the vmcore via "cp": systemd[1]: Starting Kdump Vmcore Save Service... kdump[420]: Kdump is using the default log level(3). kdump[453]: saving to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[458]: saving vmcore-dmesg.txt to /sysroot/var/crash/127.0.0.1-2021-11-11-14:59:22/ kdump[465]: saving vmcore-dmesg.txt complete kdump[467]: saving vmcore BUG: unable to handle page fault for address: 00007f2374e01000 #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 7a523067 P4D 7a523067 PUD 7a528067 PMD 7a525067 PTE 800000007048f867 Oops: 0003 [#1] PREEMPT SMP NOPTI CPU: 0 PID: 468 Comm: cp Not tainted 5.15.0+ #6 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-27-g64f37cc530f1-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_from_oldmem.part.0.cold+0x1d/0x86 Code: ff ff ff e8 05 ff fe ff e9 b9 e9 7f ff 48 89 de 48 c7 c7 38 3b 60 82 e8 f1 fe fe ff 83 fd 08 72 3c 49 8d 7d 08 4c 89 e9 89 e8 <49> c7 45 00 00 00 00 00 49 c7 44 05 f8 00 00 00 00 48 83 e7 f81 RSP: 0018:ffffc9000073be08 EFLAGS: 00010212 RAX: 0000000000001000 RBX: 00000000002fd000 RCX: 00007f2374e01000 RDX: 0000000000000001 RSI: 00000000ffffdfff RDI: 00007f2374e01008 RBP: 0000000000001000 R08: 0000000000000000 R09: ffffc9000073bc50 R10: ffffc9000073bc48 R11: ffffffff829461a8 R12: 000000000000f000 R13: 00007f2374e01000 R14: 0000000000000000 R15: ffff88807bd421e8 FS: 00007f2374e12140(0000) GS:ffff88807f000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2374e01000 CR3: 000000007a4aa000 CR4: 0000000000350eb0 Call Trace: read_vmcore+0x236/0x2c0 proc_reg_read+0x55/0xa0 vfs_read+0x95/0x190 ksys_read+0x4f/0xc0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Some x86-64 CPUs have a CPU feature called "Supervisor Mode Access Prevention (SMAP)", which is used to detect wrong access from the kernel to user buffers like this: SMAP triggers a permissions violation on wrong access. In the x86-64 variant of clear_user(), SMAP is properly handled via clac()+stac(). 
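As a reminder of the two primitives involved (a hedged sketch, not the vmcore code; the actual change follows): memset() is only valid for kernel addresses, while clear_user() performs the access-checked, SMAP-aware store to user space and returns the number of bytes it could not clear, which the caller turns into -EFAULT.

/* Kernel-style sketch of that rule; zero_hole() is a hypothetical helper name. */
static ssize_t zero_hole(char *buf, size_t nr_bytes, bool userbuf)
{
	if (!userbuf) {
		memset(buf, 0, nr_bytes);	/* kernel buffer: plain stores are fine */
		return 0;
	}
	if (clear_user((void __user *)buf, nr_bytes))	/* user buffer: checked stores */
		return -EFAULT;
	return 0;
}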
To fix, properly use clear_user() when we're dealing with a user buffer. Link: https://lkml.kernel.org/r/20211112092750.6921-1-david@redhat.com Fixes: 997c136f518c ("fs/proc/vmcore.c: add hook to read_from_oldmem() to check for non-ram pages") Signed-off-by: David Hildenbrand Acked-by: Baoquan He Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Philipp Rudo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 30a3b66f475a..509f85148fee 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -154,9 +154,13 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (!pfn_is_ram(pfn)) - memset(buf, 0, nr_bytes); - else { + if (!pfn_is_ram(pfn)) { + tmp = 0; + if (!userbuf) + memset(buf, 0, nr_bytes); + else if (clear_user(buf, nr_bytes)) + tmp = -EFAULT; + } else { if (encrypted) tmp = copy_oldmem_page_encrypted(pfn, buf, nr_bytes, @@ -165,12 +169,12 @@ ssize_t read_from_oldmem(char *buf, size_t count, else tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - - if (tmp < 0) { - up_read(&vmcore_cb_rwsem); - return tmp; - } } + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); + return tmp; + } + *ppos += nr_bytes; count -= nr_bytes; buf += nr_bytes; From 61eb495c83bf6ebde490992bf888ca15b9babc39 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 18 Nov 2021 10:26:21 -0800 Subject: [PATCH 335/336] pstore/blk: Use "%lu" to format unsigned long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 32-bit: fs/pstore/blk.c: In function ‘__best_effort_init’: include/linux/kern_levels.h:5:18: warning: format ‘%zu’ expects argument of type ‘size_t’, but argument 3 has type ‘long unsigned int’ [-Wformat=] 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ | ^~~~~~ include/linux/kern_levels.h:14:19: note: in expansion of macro ‘KERN_SOH’ 14 | #define KERN_INFO KERN_SOH "6" /* informational */ | ^~~~~~~~ include/linux/printk.h:373:9: note: in expansion of macro ‘KERN_INFO’ 373 | printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~~ fs/pstore/blk.c:314:3: note: in expansion of macro ‘pr_info’ 314 | pr_info("attached %s (%zu) (no dedicated panic_write!)\n", | ^~~~~~~ Cc: stable@vger.kernel.org Fixes: 7bb9557b48fcabaa ("pstore/blk: Use the normal block device I/O path") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210629103700.1935012-1-geert@linux-m68k.org Cc: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/pstore/blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 5d1fbaffd66a..4ae0cfcd15f2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -309,7 +309,7 @@ static int __init __best_effort_init(void) if (ret) kfree(best_effort_dev); else - pr_info("attached %s (%zu) (no dedicated panic_write!)\n", + pr_info("attached %s (%lu) (no dedicated panic_write!)\n", blkdev, best_effort_dev->zone.total_size); return ret; From 136057256686de39cc3a07c2e39ef6bc43003ff6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 21 Nov 2021 13:47:39 -0800 Subject: [PATCH 336/336] Linux 5.16-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e12c14ea0fb..daf95a574b08 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 
PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Trick or Treat # *DOCUMENTATION*