From 2f2724630f7a8d582470f03ee56b96746767d270 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 22 May 2017 16:25:14 +1000 Subject: [PATCH 001/149] KVM: PPC: Book3S HV: Cope with host using large decrementer mode POWER9 introduces a new mode for the decrementer register, called large decrementer mode, in which the decrementer counter is 56 bits wide rather than 32, and reads are sign-extended rather than zero-extended. For the decrementer, this new mode is optional and controlled by a bit in the LPCR. The hypervisor decrementer (HDEC) is 56 bits wide on POWER9 and has no mode control. Since KVM code reads and writes the decrementer and hypervisor decrementer registers in a few places, it needs to be aware of the need to treat the decrementer value as a 64-bit quantity, and only do a 32-bit sign extension when large decrementer mode is not in effect. Similarly, the HDEC should always be treated as a 64-bit quantity on POWER9. We define a new EXTEND_HDEC macro to encapsulate the feature test for POWER9 and the sign extension. To enable the sign extension to be removed in large decrementer mode, we test the LPCR_LD bit in the host LPCR image stored in the struct kvm for the guest. If it is set, then large decrementer mode is enabled and the sign extension should be skipped. This is partly based on an earlier patch by Oliver O'Halloran. Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_interrupts.S | 12 +++++++++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 23 +++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 0fdc4a28970b..404deb512844 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -121,10 +121,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * Put whatever is in the decrementer into the * hypervisor decrementer. */ +BEGIN_FTR_SECTION + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_KVM(r5) + ld r9, KVM_HOST_LPCR(r6) + andis. 
r9, r9, LPCR_LD@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mfspr r8,SPRN_DEC mftb r7 - mtspr SPRN_HDEC,r8 +BEGIN_FTR_SECTION + /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */ + bne 32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r8,r8 +32: mtspr SPRN_HDEC,r8 add r8,r8,r7 std r8,HSTATE_DECEXP(r13) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bdb3f76ceb6b..e390b383b4d6 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -32,6 +32,12 @@ #include #include +/* Sign-extend HDEC if not on POWER9 */ +#define EXTEND_HDEC(reg) \ +BEGIN_FTR_SECTION; \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) /* Values in HSTATE_NAPPING(r13) */ @@ -214,6 +220,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + /* HDEC may be larger than DEC for arch >= v3.00, but since the */ + /* HDEC value came from DEC in the first place, it will fit */ mfspr r3, SPRN_HDEC mtspr SPRN_DEC, r3 /* @@ -295,8 +303,9 @@ kvm_novcpu_wakeup: /* See if our timeslice has expired (HDEC is negative) */ mfspr r0, SPRN_HDEC + EXTEND_HDEC(r0) li r12, BOOK3S_INTERRUPT_HV_DECREMENTER - cmpwi r0, 0 + cmpdi r0, 0 blt kvm_novcpu_exit /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ @@ -390,8 +399,8 @@ kvm_secondary_got_guest: lbz r4, HSTATE_PTID(r13) cmpwi r4, 0 bne 63f - lis r6, 0x7fff - ori r6, r6, 0xffff + LOAD_REG_ADDR(r6, decrementer_max) + ld r6, 0(r6) mtspr SPRN_HDEC, r6 /* and set per-LPAR registers, if doing dynamic micro-threading */ ld r6, HSTATE_SPLIT_MODE(r13) @@ -968,7 +977,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* Check if HDEC expires soon */ mfspr r3, SPRN_HDEC - cmpwi r3, 512 /* 1 microsecond */ + EXTEND_HDEC(r3) + cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon #ifdef CONFIG_KVM_XICS @@ -2366,12 +2376,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) mfspr r3, SPRN_DEC mfspr r4, SPRN_HDEC mftb r5 - cmpw r3, r4 + extsw r3, r3 + EXTEND_HDEC(r4) + cmpd r3, r4 ble 67f mtspr SPRN_DEC, r4 67: /* save expiry time of guest decrementer */ - extsw r3, r3 add r3, r3, r5 ld r4, HSTATE_KVM_VCPU(r13) ld r5, HSTATE_KVM_VCORE(r13) From 3def03441e53e29eed3afd9009974a5a42bf124a Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sat, 20 May 2017 13:27:00 +0200 Subject: [PATCH 002/149] genksyms: add printf format attribute to error_with_pos() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compiling with -Wsuggest-attribute=format in HOSTCFLAGS, gcc complains that error_with_pos() may be declared with a printf format attribute: scripts/genksyms/genksyms.c:726:3: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] vfprintf(stderr, fmt, args); ^~~~~~~~ This would allow catching printf-format errors at compile time in callers to error_with_pos(). Add this attribute. 
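As an illustration (not part of this patch), a minimal user-space sketch of what the attribute buys: the helper name below is made up, only the attribute syntax matches what genksyms now declares.

  #include <stdarg.h>
  #include <stdio.h>

  /* Same printf-style contract as error_with_pos(): the format string is
   * argument 1 and the variadic arguments start at argument 2. */
  static void error_msg(const char *fmt, ...)
          __attribute__ ((format (printf, 1, 2)));

  static void error_msg(const char *fmt, ...)
  {
          va_list args;

          va_start(args, fmt);
          vfprintf(stderr, fmt, args);
          va_end(args);
  }

  int main(void)
  {
          error_msg("%d symbols\n", 10);     /* fine */
          error_msg("%d symbols\n", "ten");  /* now caught: -Wformat warning */
          return 0;
  }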
Signed-off-by: Nicolas Iooss Signed-off-by: Masahiro Yamada --- scripts/genksyms/genksyms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/genksyms/genksyms.h b/scripts/genksyms/genksyms.h index 3bffdcaaa274..b724a0290c75 100644 --- a/scripts/genksyms/genksyms.h +++ b/scripts/genksyms/genksyms.h @@ -75,7 +75,7 @@ struct string_list *copy_list_range(struct string_list *start, int yylex(void); int yyparse(void); -void error_with_pos(const char *, ...); +void error_with_pos(const char *, ...) __attribute__ ((format(printf, 1, 2))); /*----------------------------------------------------------------------*/ #define xmalloc(size) ({ void *__ptr = malloc(size); \ From 501e7a4689378f8b1690089bfdd4f1e12ec22903 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 2 Jun 2017 11:21:34 -0400 Subject: [PATCH 003/149] NFSv4.2: Don't send mode again in post-EXCLUSIVE4_1 SETATTR with umask Now that we have umask support, we shouldn't re-send the mode in a SETATTR following an exclusive CREATE, or we risk having the same problem fixed in commit 5334c5bdac92 ("NFS: Send attributes in OPEN request for NFS4_CREATE_EXCLUSIVE4_1"), which is that files with S_ISGID will have that bit stripped away. Signed-off-by: Benjamin Coddington Fixes: dff25ddb4808 ("nfs: add support for the umask attribute") Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c08c46a3b8cd..4e43fd1270c4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2589,7 +2589,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, /* Except MODE, it seems harmless of setting twice. */ if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && - attrset[1] & FATTR4_WORD1_MODE) + (attrset[1] & FATTR4_WORD1_MODE || + attrset[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) From cbf52a3e6a8a92beec6e0c70abf4111cd8f8faf7 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 5 Jun 2017 13:59:15 +0200 Subject: [PATCH 004/149] tags: honor COMPILED_SOURCE with apart output directory When the kernel is compiled with an "O=" argument, the object files are not in the source tree, but in the build tree. This patch fixes O= build by looking for object files in the build tree. Fixes: 923e02ecf3f8 ("scripts/tags.sh: Support compiled source") Signed-off-by: Robert Jarzmik Signed-off-by: Masahiro Yamada --- scripts/tags.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/tags.sh b/scripts/tags.sh index d661f2f3ef61..d23dcbf17457 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -106,6 +106,7 @@ all_compiled_sources() case "$i" in *.[cS]) j=${i/\.[cS]/\.o} + j="${j#$tree}" if [ -e $j ]; then echo $i fi From 27fef9f8ecb0495d302deba210606a32e54db37a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 6 Jun 2017 09:46:33 +0100 Subject: [PATCH 005/149] mfd: arizona: Fix typo using hard-coded register A hardcoded register is accidentally used instead of the register address passed into the function. Correct this and use the appropriate variable. This would cause minor issues on wm5102, but all other devices using this driver would have been unaffected. 
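As an illustration (not part of this patch), a hedged sketch of what the fixed helper is expected to do; the function name is hypothetical and the 5 ms poll interval is an arbitrary choice:

  #include <linux/regmap.h>

  /* Poll the register the caller asked for - not a hard-coded one - until
   * (val & mask) == target or the timeout expires. */
  static int example_poll_reg(struct regmap *map, unsigned int reg,
                              unsigned int mask, unsigned int target,
                              unsigned int timeout_ms)
  {
          unsigned int val;

          return regmap_read_poll_timeout(map, reg, val,
                                          (val & mask) == target,
                                          5000, timeout_ms * 1000);
  }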
Fixes: commit ef84f885e037 ("mfd: arizona: Refactor arizona_poll_reg") Reported-by: Andrzej Hajda Signed-off-by: Charles Keepax Signed-off-by: Lee Jones --- drivers/mfd/arizona-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c index 75488e65cd96..8d46e3ad9529 100644 --- a/drivers/mfd/arizona-core.c +++ b/drivers/mfd/arizona-core.c @@ -245,8 +245,7 @@ static int arizona_poll_reg(struct arizona *arizona, int ret; ret = regmap_read_poll_timeout(arizona->regmap, - ARIZONA_INTERRUPT_RAW_STATUS_5, val, - ((val & mask) == target), + reg, val, ((val & mask) == target), ARIZONA_REG_POLL_DELAY_US, timeout_ms * 1000); if (ret) From 9ba26a7283f56100eb08a2df48f17da600f60d52 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Tue, 6 Jun 2017 17:07:53 +0800 Subject: [PATCH 006/149] Kbuild: tiny correction on `make help` The help info of `make C=1` is a little confusing; make it clear. Signed-off-by: Cao jin Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 63e10bd4f14a..1e2ae78f8ba5 100644 --- a/Makefile +++ b/Makefile @@ -1437,7 +1437,7 @@ help: @echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build' @echo ' make V=2 [targets] 2 => give reason for rebuild of target' @echo ' make O=dir [targets] Locate all output files in "dir", including .config' - @echo ' make C=1 [targets] Check all c source with $$CHECK (sparse by default)' + @echo ' make C=1 [targets] Check re-compiled c source with $$CHECK (sparse by default)' @echo ' make C=2 [targets] Force check of all c source with $$CHECK' @echo ' make RECORDMCOUNT_WARN=1 [targets] Warn about ignored mcount sections' @echo ' make W=n [targets] Enable extra gcc checks, n=1,2,3 where' From b81f884a547b5c264c13fdfaa3b65cf994bf1dcf Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 1 Jun 2017 14:57:56 +0800 Subject: [PATCH 007/149] xfrm: fix xfrm_dev_event() missing when compiled without CONFIG_XFRM_OFFLOAD In commit d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") we made xfrm_device.o compiled only when the option CONFIG_XFRM_OFFLOAD is enabled. But this will make xfrm_dev_event() missing if we only enable the default XFRM options. Then if we set down and unregister an interface with IPsec on it, there will be no xfrm_garbage_collect(), which will cause the dev usage count to be held and produce an error like: unregister_netdevice: waiting for to become free. 
Usage count = 4 Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Hangbin Liu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 ++----- net/xfrm/Makefile | 3 +-- net/xfrm/xfrm_device.c | 2 ++ 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7e7e2b0d2915..62f5a259e597 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1850,8 +1850,9 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) } #endif -#ifdef CONFIG_XFRM_OFFLOAD void __net_init xfrm_dev_init(void); + +#ifdef CONFIG_XFRM_OFFLOAD int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); @@ -1877,10 +1878,6 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) } } #else -static inline void __net_init xfrm_dev_init(void) -{ -} - static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) { return 0; diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index abf81b329dc1..55b2ac300995 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -4,8 +4,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ - xfrm_sysctl.o xfrm_replay.o -obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o + xfrm_sysctl.o xfrm_replay.o xfrm_device.o obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 574e6f32f94f..5aba03685d7d 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -22,6 +22,7 @@ #include #include +#ifdef CONFIG_XFRM_OFFLOAD int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) { int err; @@ -137,6 +138,7 @@ ok: return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); +#endif int xfrm_dev_register(struct net_device *dev) { From 73d4e580ccc5c3e05cea002f18111f66c9c07034 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 2 Jun 2017 20:00:17 -0700 Subject: [PATCH 008/149] target: Fix kref->refcount underflow in transport_cmd_finish_abort This patch fixes a se_cmd->cmd_kref underflow during CMD_T_ABORTED when a fabric driver drops its second reference from below the target_core_tmr.c based callers of transport_cmd_finish_abort(). Recently with the conversion of kref to refcount_t, this bug was manifesting itself as: [705519.601034] refcount_t: underflow; use-after-free. [705519.604034] INFO: NMI handler (kgdb_nmi_handler) took too long to run: 20116.512 msecs [705539.719111] ------------[ cut here ]------------ [705539.719117] WARNING: CPU: 3 PID: 26510 at lib/refcount.c:184 refcount_sub_and_test+0x33/0x51 Since the original kref atomic_t based kref_put() didn't check for underflow and only invoked the final callback when zero was reached, this bug did not manifest in practice since all se_cmd memory is using preallocated tags. To address this, go ahead and propagate the existing return from transport_put_cmd() up via transport_cmd_finish_abort(), and change transport_cmd_finish_abort() + core_tmr_handle_tas_abort() callers to only do their local target_put_sess_cmd() if necessary. 
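As an illustration (not the driver's actual code), a small sketch of the reference-counting pattern being applied here; the names are hypothetical and the caller is assumed to hold two references of its own:

  #include <linux/types.h>
  #include <linux/kref.h>

  struct obj {
          struct kref kref;
  };

  static void obj_release(struct kref *kref)
  {
          /* final teardown happens exactly once */
  }

  /* Returns nonzero if this call dropped the final reference. */
  static int obj_finish_abort(struct obj *o, bool remove)
  {
          if (remove)
                  return kref_put(&o->kref, obj_release);
          return 0;
  }

  static void obj_abort_path(struct obj *o)
  {
          /* Only drop the local reference if the abort path did not already
           * consume the last one - this is what prevents the underflow. */
          if (!obj_finish_abort(o, true))
                  kref_put(&o->kref, obj_release);
  }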
Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: Mike Christie Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Himanshu Madhani Cc: Sagi Grimberg Cc: stable@vger.kernel.org # 3.14+ Tested-by: Gary Guo Tested-by: Chu Yuan Lin Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_internal.h | 2 +- drivers/target/target_core_tmr.c | 16 ++++++++-------- drivers/target/target_core_transport.c | 9 ++++++--- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 9ab7090f7c83..0912de7c0cf8 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -136,7 +136,7 @@ int init_se_kmem_caches(void); void release_se_kmem_caches(void); u32 scsi_get_new_index(scsi_index_t); void transport_subsystem_check_init(void); -void transport_cmd_finish_abort(struct se_cmd *, int); +int transport_cmd_finish_abort(struct se_cmd *, int); unsigned char *transport_dump_cmd_direction(struct se_cmd *); void transport_dump_dev_state(struct se_device *, char *, int *); void transport_dump_dev_info(struct se_device *, struct se_lun *, diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index dce1e1b47316..13f47bf4d16b 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -75,7 +75,7 @@ void core_tmr_release_req(struct se_tmr_req *tmr) kfree(tmr); } -static void core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas) +static int core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas) { unsigned long flags; bool remove = true, send_tas; @@ -91,7 +91,7 @@ static void core_tmr_handle_tas_abort(struct se_cmd *cmd, int tas) transport_send_task_abort(cmd); } - transport_cmd_finish_abort(cmd, remove); + return transport_cmd_finish_abort(cmd, remove); } static int target_check_cdb_and_preempt(struct list_head *list, @@ -184,8 +184,8 @@ void core_tmr_abort_task( cancel_work_sync(&se_cmd->work); transport_wait_for_tasks(se_cmd); - transport_cmd_finish_abort(se_cmd, true); - target_put_sess_cmd(se_cmd); + if (!transport_cmd_finish_abort(se_cmd, true)) + target_put_sess_cmd(se_cmd); printk("ABORT_TASK: Sending TMR_FUNCTION_COMPLETE for" " ref_tag: %llu\n", ref_tag); @@ -281,8 +281,8 @@ static void core_tmr_drain_tmr_list( cancel_work_sync(&cmd->work); transport_wait_for_tasks(cmd); - transport_cmd_finish_abort(cmd, 1); - target_put_sess_cmd(cmd); + if (!transport_cmd_finish_abort(cmd, 1)) + target_put_sess_cmd(cmd); } } @@ -380,8 +380,8 @@ static void core_tmr_drain_state_list( cancel_work_sync(&cmd->work); transport_wait_for_tasks(cmd); - core_tmr_handle_tas_abort(cmd, tas); - target_put_sess_cmd(cmd); + if (!core_tmr_handle_tas_abort(cmd, tas)) + target_put_sess_cmd(cmd); } } diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 6025935036c9..f1b3a46bdcaf 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -651,9 +651,10 @@ static void transport_lun_remove_cmd(struct se_cmd *cmd) percpu_ref_put(&lun->lun_ref); } -void transport_cmd_finish_abort(struct se_cmd *cmd, int remove) +int transport_cmd_finish_abort(struct se_cmd *cmd, int remove) { bool ack_kref = (cmd->se_cmd_flags & SCF_ACK_KREF); + int ret = 0; if (cmd->se_cmd_flags & SCF_SE_LUN_CMD) transport_lun_remove_cmd(cmd); @@ -665,9 +666,11 @@ void transport_cmd_finish_abort(struct se_cmd *cmd, int remove) cmd->se_tfo->aborted_task(cmd); if (transport_cmd_check_stop_to_fabric(cmd)) 
- return; + return 1; if (remove && ack_kref) - transport_put_cmd(cmd); + ret = transport_put_cmd(cmd); + + return ret; } static void target_complete_failure_work(struct work_struct *work) From 105fa2f44e504c830697b0c794822112d79808dc Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 3 Jun 2017 05:35:47 -0700 Subject: [PATCH 009/149] iscsi-target: Fix delayed logout processing greater than SECONDS_FOR_LOGOUT_COMP This patch fixes a BUG() in iscsit_close_session() that could be triggered when iscsit_logout_post_handler() execution from within tx thread context was not run for more than SECONDS_FOR_LOGOUT_COMP (15 seconds), and the TCP connection didn't already close before then forcing tx thread context to automatically exit. This would manifest itself during explicit logout as: [33206.974254] 1 connection(s) still exist for iSCSI session to iqn.1993-08.org.debian:01:3f5523242179 [33206.980184] INFO: NMI handler (kgdb_nmi_handler) took too long to run: 2100.772 msecs [33209.078643] ------------[ cut here ]------------ [33209.078646] kernel BUG at drivers/target/iscsi/iscsi_target.c:4346! Normally when explicit logout attempt fails, the tx thread context exits and iscsit_close_connection() from rx thread context does the extra cleanup once it detects conn->conn_logout_remove has not been cleared by the logout type specific post handlers. To address this special case, if the logout post handler in tx thread context detects conn->tx_thread_active has already been cleared, simply return and exit in order for existing iscsit_close_connection() logic from rx thread context do failed logout cleanup. Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: Mike Christie Cc: Hannes Reinecke Cc: Sagi Grimberg Cc: stable@vger.kernel.org # 3.14+ Tested-by: Gary Guo Tested-by: Chu Yuan Lin Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 0d8f81591bed..c0254516b380 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4423,8 +4423,11 @@ static void iscsit_logout_post_handler_closesession( * always sleep waiting for RX/TX thread shutdown to complete * within iscsit_close_connection(). */ - if (!conn->conn_transport->rdma_shutdown) + if (!conn->conn_transport->rdma_shutdown) { sleep = cmpxchg(&conn->tx_thread_active, true, false); + if (!sleep) + return; + } atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); @@ -4440,8 +4443,11 @@ static void iscsit_logout_post_handler_samecid( { int sleep = 1; - if (!conn->conn_transport->rdma_shutdown) + if (!conn->conn_transport->rdma_shutdown) { sleep = cmpxchg(&conn->tx_thread_active, true, false); + if (!sleep) + return; + } atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); From abb85a9b512e8ca7ad04a5a8a6db9664fe644974 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 7 Jun 2017 20:29:50 -0700 Subject: [PATCH 010/149] iscsi-target: Reject immediate data underflow larger than SCSI transfer length When iscsi WRITE underflow occurs there are two different scenarios that can happen. Normally in practice, when an EDTL vs. SCSI CDB TRANSFER LENGTH underflow is detected, the iscsi immediate data payload is the smaller SCSI CDB TRANSFER LENGTH. 
That is, when a host fabric LLD is using a fixed size EDTL for a specific control CDB, the SCSI CDB TRANSFER LENGTH and actual SCSI payload ends up being smaller than EDTL. In iscsi, this means the received iscsi immediate data payload matches the smaller SCSI CDB TRANSFER LENGTH, because there is no more SCSI payload to accept beyond SCSI CDB TRANSFER LENGTH. However, it's possible for a malicious host to send a WRITE underflow where EDTL is larger than SCSI CDB TRANSFER LENGTH, but incoming iscsi immediate data actually matches EDTL. In the wild, we've never had an iscsi host environment actually try to do this. For this special case, it's wrong to truncate part of the control CDB payload and continue to process the command during underflow when the received immediate data payload was larger than SCSI CDB TRANSFER LENGTH, so go ahead and reject and drop the bogus payload as a defensive action. Note this potential bug was originally relaxed by the following for allowing WRITE underflow in MSFT FCP host environments: commit c72c5250224d475614a00c1d7e54a67f77cd3410 Author: Roland Dreier Date: Wed Jul 22 15:08:18 2015 -0700 target: allow underflow/overflow for PR OUT etc. commands Cc: Roland Dreier Cc: Mike Christie Cc: Hannes Reinecke Cc: Martin K. Petersen Cc: # v4.3+ Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index c0254516b380..3fdca2cdd8da 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -1279,6 +1279,18 @@ iscsit_get_immediate_data(struct iscsi_cmd *cmd, struct iscsi_scsi_req *hdr, */ if (dump_payload) goto after_immediate_data; + /* + * Check for underflow case where both EDTL and immediate data payload + * exceeds what is presented by CDB's TRANSFER LENGTH, and what has + * already been set in target_cmd_size_check() as se_cmd->data_length. + * + * For this special case, fail the command and dump the immediate data + * payload. + */ + if (cmd->first_burst_len > cmd->se_cmd.data_length) { + cmd->sense_reason = TCM_INVALID_CDB_FIELD; + goto after_immediate_data; + } immed_ret = iscsit_handle_immediate_data(cmd, hdr, cmd->first_burst_len); From ff85a1a80e00349dc7783c8dc4d6233d9a709283 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 21 May 2017 11:44:47 +0200 Subject: [PATCH 011/149] kconfig: Check for libncurses before menuconfig There is a check and a nice user-friendly message when the curses library is not present on the system and the user wants to do "make menuconfig". It doesn't get issued, though. Instead, we fail the build when mconf.c doesn't find the curses.h header: HOSTCC scripts/kconfig/mconf.o In file included from scripts/kconfig/mconf.c:23:0: scripts/kconfig/lxdialog/dialog.h:38:20: fatal error: curses.h: No such file or directory #include CURSES_LOC ^ compilation terminated. Make that check a prerequisite to mconf so that the user sees the error message instead: $ make menuconfig *** Unable to find the ncurses libraries or the *** required header files. *** 'make menuconfig' requires the ncurses libraries. *** *** Install ncurses (ncurses-devel) and try again. 
*** scripts/kconfig/Makefile:203: recipe for target 'scripts/kconfig/dochecklxdialog' failed make[1]: *** [scripts/kconfig/dochecklxdialog] Error 1 Makefile:548: recipe for target 'menuconfig' failed make: *** [menuconfig] Error 2 Signed-off-by: Borislav Petkov Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 90a091b6ae4d..eb8144643b78 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -196,7 +196,7 @@ clean-files += config.pot linux.pot # Check that we have the required ncurses stuff installed for lxdialog (menuconfig) PHONY += $(obj)/dochecklxdialog -$(addprefix $(obj)/,$(lxdialog)): $(obj)/dochecklxdialog +$(addprefix $(obj)/, mconf.o $(lxdialog)): $(obj)/dochecklxdialog $(obj)/dochecklxdialog: $(Q)$(CONFIG_SHELL) $(check-lxdialog) -check $(HOSTCC) $(HOST_EXTRACFLAGS) $(HOSTLOADLIBES_mconf) From 138437f591dd9a42d53c6fed1a3c85e02678851c Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Sun, 11 Jun 2017 09:44:20 +0800 Subject: [PATCH 012/149] xfrm: move xfrm_garbage_collect out of xfrm_policy_flush Now we force garbage collection if any policy is removed in xfrm_policy_flush(). But during xfrm_net_exit() we call flow_cache_fini() first and set fc->percpu to NULL. Then after we call xfrm_policy_fini() -> xfrm_policy_flush() -> flow_cache_flush(), we will get a NULL pointer dereference when checking percpu_empty. The code path looks like: flow_cache_fini() - fc->percpu = NULL xfrm_policy_fini() - xfrm_policy_flush() - xfrm_garbage_collect() - flow_cache_flush() - flow_cache_percpu_empty() - fcp = per_cpu_ptr(fc->percpu, cpu) To reproduce, just add ipsec in a netns and then remove the netns. v2: As Xin Long suggested, since only two other places need to call it, move xfrm_garbage_collect() outside xfrm_policy_flush(). v3: Fix subject mismatch after v2 fix. 
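As an illustration (hypothetical wrapper, not one of the actual call sites), the resulting caller-side ordering: flush first, then collect, while the namespace and its flow cache are known to be alive:

  #include <net/xfrm.h>

  /* Sketch only; the real callers are the pfkey and netlink flush paths. */
  static int example_flush_policies(struct net *net)
  {
          int err;

          err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true);
          if (!err)
                  xfrm_garbage_collect(net);

          return err;
  }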
Fixes: 35db06912189 ("xfrm: do the garbage collection after flushing policy") Signed-off-by: Hangbin Liu Reviewed-by: Xin Long Signed-off-by: Steffen Klassert --- net/key/af_key.c | 2 ++ net/xfrm/xfrm_policy.c | 4 ---- net/xfrm/xfrm_user.c | 1 + 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 512dc43d0ce6..5103f92e2eb0 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2755,6 +2755,8 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); + if (!err) + xfrm_garbage_collect(net); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ed4e52d95172..643a18f72032 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1006,10 +1006,6 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); - - if (cnt) - xfrm_garbage_collect(net); - return err; } EXPORT_SYMBOL(xfrm_policy_flush); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 38614df33ec8..86116e9aaf3d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2027,6 +2027,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; return err; } + xfrm_garbage_collect(net); c.data.type = type; c.event = nlh->nlmsg_type; From 3db1200ca21f3c63c9044185dc5762ef996848cb Mon Sep 17 00:00:00 2001 From: Frank Rowand Date: Fri, 9 Jun 2017 17:26:32 -0700 Subject: [PATCH 013/149] clocksource/drivers/arm_arch_timer: Fix read and iounmap of incorrect variable Fix boot warning 'Trying to vfree() nonexistent vm area' from arch_timer_mem_of_init(). Refactored code attempts to read and iounmap using address frame instead of address ioremap(frame->cntbase). Fixes: c389d701dfb70 ("clocksource: arm_arch_timer: split MMIO timer probing.") Signed-off-by: Frank Rowand Reviewed-by: Fu Wei Acked-by: Marc Zyngier Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 4bed671e490e..8b5c30062d99 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -1209,9 +1209,9 @@ arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame) return 0; } - rate = readl_relaxed(frame + CNTFRQ); + rate = readl_relaxed(base + CNTFRQ); - iounmap(frame); + iounmap(base); return rate; } From 60ce2858514ed9ccaf00dc7e9f4dc219537e9855 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 9 Jun 2017 10:14:53 +0100 Subject: [PATCH 014/149] ARM: 8680/1: boot/compressed: fix inappropriate Thumb2 mnemonic for __nop Commit 06a4b6d009a1 ("ARM: 8677/1: boot/compressed: fix decompressor header layout for v7-M") fixed an issue in the layout of the header of the compressed kernel image that was caused by the assembler emitting narrow opcodes for 'mov r0, r0', and for this reason, the mnemonic was updated to use the W() macro, which will append the .w suffix (which forces a wide encoding) if required, i.e., when building the kernel in Thumb2 mode. 
However, this failed to take into account that on Thumb2 kernels built for CPUs that are also ARM capable, the entry point is entered in ARM mode, and so the instructions emitted here will be ARM instructions that only exist in a wide encoding to begin with, which is why the assembler rejects the .w suffix here and aborts the build with the following message: head.S: Assembler messages: head.S:132: Error: width suffixes are invalid in ARM mode -- `mov.w r0,r0' So replace the W(mov) with separate ARM and Thumb2 instructions, where the latter will only be used for THUMB2_ONLY builds. Fixes: 06a4b6d009a1 ("ARM: 8677/1: boot/compressed: fix decompressor ...") Reported-by: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/boot/compressed/efi-header.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/compressed/efi-header.S b/arch/arm/boot/compressed/efi-header.S index 3f7d1b74c5e0..a17ca8d78656 100644 --- a/arch/arm/boot/compressed/efi-header.S +++ b/arch/arm/boot/compressed/efi-header.S @@ -17,7 +17,8 @@ @ there. .inst 'M' | ('Z' << 8) | (0x1310 << 16) @ tstne r0, #0x4d000 #else - W(mov) r0, r0 + AR_CLASS( mov r0, r0 ) + M_CLASS( nop.w ) #endif .endm From bbeedfda8eee0b17ea16e4e157c596095458676a Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 9 Jun 2017 15:28:18 +0100 Subject: [PATCH 015/149] ARM: 8681/1: make VMSPLIT_3G_OPT depends on !ARM_LPAE When both CONFIG_ARM_LPAE=y and CONFIG_VMSPLIT_3G_OPT=y are enabled, which means using PAGE_OFFSET=0xB0000000 with ARM_LPAE, the kernel fails to boot and stops after being uncompressed: Starting kernel ... Uart base = 0x20001000 watchdog reg = 0x20013000 dtb addr = 0x80840308 Uncompressing Linux... done, booting the kernel. This is because ARM_LPAE only supports a 3:1, 2:2 or 1:3 split of TTBR1, as mentioned in: http://elinux.org/images/6/6a/Elce11_marinas.pdf - p16 So we should make VMSPLIT_3G_OPT depend on !ARM_LPAE to avoid triggering this bug. Acked-by: Nicolas Pitre Signed-off-by: Yisheng Xie Signed-off-by: Russell King --- arch/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4c1a35f15838..c0fcab6a5504 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1416,6 +1416,7 @@ choice config VMSPLIT_3G bool "3G/1G user/kernel split" config VMSPLIT_3G_OPT + depends on !ARM_LPAE bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" From d360a687d99577110c181e67ebfb9a1b6fed63a2 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 12 Jun 2017 13:35:52 +0100 Subject: [PATCH 016/149] ARM: 8682/1: V7M: Set cacheid iff DminLine or IminLine is nonzero Cache support is an optional feature in M-class cores, thus DminLine or IminLine of the Cache Type Register is zero if caches are not implemented, but we check the whole CTR, which has other features encoded there. Let's be more precise and check for DminLine and IminLine of CTR before we set cacheid. 
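As an illustration (helper name made up), the fields being checked: only IminLine (bits [3:0]) and DminLine (bits [19:16]) of the Cache Type Register indicate whether caches are implemented, which is what the 0xf000f mask in the change below selects:

  #define V7M_CTR_IMINLINE_MASK   0x0000000f      /* IminLine, bits [3:0]   */
  #define V7M_CTR_DMINLINE_MASK   0x000f0000      /* DminLine, bits [19:16] */

  /* Nonzero if the v7-M CTR value reports at least one implemented cache. */
  static inline int v7m_ctr_has_cache(unsigned int cachetype)
  {
          return cachetype & (V7M_CTR_IMINLINE_MASK | V7M_CTR_DMINLINE_MASK);
  }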
Signed-off-by: Vladimir Murzin Signed-off-by: Russell King --- arch/arm/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 32e1a9513dc7..4e80bf7420d4 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -315,7 +315,7 @@ static void __init cacheid_init(void) if (arch >= CPU_ARCH_ARMv6) { unsigned int cachetype = read_cpuid_cachetype(); - if ((arch == CPU_ARCH_ARMv7M) && !cachetype) { + if ((arch == CPU_ARCH_ARMv7M) && !(cachetype & 0xf000f)) { cacheid = 0; } else if ((cachetype & (7 << 29)) == 4 << 29) { /* ARMv7 register format */ From 2ad50606f847a902303a5364b7cad64bdd6246f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ondrej=20Mosn=C3=A1=C4=8Dek?= Date: Mon, 5 Jun 2017 17:52:39 +0200 Subject: [PATCH 017/149] dm integrity: reject mappings too large for device dm-integrity would successfully create mappings with the number of sectors greater than the provided data sector count. Attempts to read sectors of this mapping that were beyond the provided data sector count would then yield run-time messages of the form "device-mapper: integrity: Too big sector number: ...". Fix this by emitting an error when the requested mapping size is bigger than the provided data sector count. Signed-off-by: Ondrej Mosnacek Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7910bfe50da4..4ab10cf718c9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3040,6 +3040,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "The device is too small"; goto bad; } + if (ti->len > ic->provided_data_sectors) { + r = -EINVAL; + ti->error = "Not enough provided sectors for requested mapping size"; + goto bad; + } if (!buffer_sectors) buffer_sectors = 1; From ca8efa1df1d15a1795a2da57f9f6aada6ed6b946 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Jun 2017 16:47:22 +1000 Subject: [PATCH 018/149] KVM: PPC: Book3S HV: Context-switch EBB registers properly This adds code to save the values of three SPRs (special-purpose registers) used by userspace to control event-based branches (EBBs), which are essentially interrupts that get delivered directly to userspace. These registers are loaded up with guest values when entering the guest, and their values are saved when exiting the guest, but we were not saving the host values and restoring them before going back to userspace. On POWER8 this would only affect userspace programs which explicitly request the use of EBBs and also use the KVM_RUN ioctl, since the only source of EBBs on POWER8 is the PMU, and there is an explicit enable bit in the PMU registers (and those PMU registers do get properly context-switched between host and guest). On POWER9 there is provision for externally-generated EBBs, and these are not subject to the control in the PMU registers. Since these registers only affect userspace, we can save them when we first come in from userspace and restore them before returning to userspace, rather than saving/restoring the host values on every guest entry/exit. Similarly, we don't need to worry about their values on offline secondary threads since they execute in the context of the idle task, which never executes in userspace. 
Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs", 2014-01-08) Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 42b7a4fd57d9..400a5992b121 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2907,6 +2907,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; + unsigned long ebb_regs[3] = {}; /* shut up GCC */ if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -2934,6 +2935,13 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_all_to_thread(current); + /* Save userspace EBB register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + ebb_regs[0] = mfspr(SPRN_EBBHR); + ebb_regs[1] = mfspr(SPRN_EBBRR); + ebb_regs[2] = mfspr(SPRN_BESCR); + } + vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; @@ -2960,6 +2968,13 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } } while (is_kvmppc_resume_guest(r)); + /* Restore userspace EBB register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_EBBHR, ebb_regs[0]); + mtspr(SPRN_EBBRR, ebb_regs[1]); + mtspr(SPRN_BESCR, ebb_regs[2]); + } + out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); From 459fa246d8fa4a543ed9a3331f15c8fe1caf9937 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sun, 11 Jun 2017 15:22:10 +1000 Subject: [PATCH 019/149] clocksource: Explicitly include linux/clocksource.h when needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kbuild test robot reported errors in these files when doing an ia64 allmodconfig build. 
drivers/clocksource/timer-sun5i.c:52:21: error: field 'clksrc' has incomplete type struct clocksource clksrc; ^~~~~~ drivers/clocksource/cadence_ttc_timer.c:92:21: error: field 'cs' has incomplete type struct clocksource cs; ^~ (and many more errors for these files) Cc: Michal Simek Cc: "Sören Brinkmann" Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Maxime Ripard Cc: Chen-Yu Tsai Reported-by: kbuild test robot Signed-off-by: Stephen Rothwell Acked-by: Michal Simek Signed-off-by: Daniel Lezcano --- drivers/clocksource/cadence_ttc_timer.c | 1 + drivers/clocksource/timer-sun5i.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/clocksource/cadence_ttc_timer.c b/drivers/clocksource/cadence_ttc_timer.c index 44e5e951583b..8e64b8460f11 100644 --- a/drivers/clocksource/cadence_ttc_timer.c +++ b/drivers/clocksource/cadence_ttc_timer.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 2e9c830ae1cd..c4656c4d44a6 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From 46464411307746e6297a034a9983a22c9dfc5a0c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 18 May 2017 17:28:47 +0200 Subject: [PATCH 020/149] xen/blkback: fix disconnect while I/Os in flight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today disconnecting xen-blkback is broken in case there are still I/Os in flight: xen_blkif_disconnect() will bail out early without releasing all resources in the hope it will be called again when the last request has terminated. This, however, won't happen as xen_blkif_free() won't be called on termination of the last running request: xen_blkif_put() won't decrement the blkif refcnt to 0 as xen_blkif_disconnect() didn't finish before thus some xen_blkif_put() calls in xen_blkif_disconnect() didn't happen. To solve this deadlock xen_blkif_disconnect() and xen_blkif_alloc_rings() shouldn't use xen_blkif_put() and xen_blkif_get() but use some other way to do their accounting of resources. This at once fixes another error in xen_blkif_disconnect(): when it returned early with -EBUSY for another ring than 0 it would call xen_blkif_put() again for already handled rings on a subsequent call. This will lead to inconsistencies in the refcnt handling. Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross Tested-by: Steven Haigh Acked-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 1 + drivers/block/xen-blkback/xenbus.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index dea61f6ab8cb..638597b17a38 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -281,6 +281,7 @@ struct xen_blkif_ring { wait_queue_head_t wq; atomic_t inflight; + bool active; /* One thread per blkif ring. 
*/ struct task_struct *xenblkd; unsigned int waiting_reqs; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 1f3dfaa54d87..998915174bb8 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -159,7 +159,7 @@ static int xen_blkif_alloc_rings(struct xen_blkif *blkif) init_waitqueue_head(&ring->shutdown_wq); ring->blkif = blkif; ring->st_print = jiffies; - xen_blkif_get(blkif); + ring->active = true; } return 0; @@ -249,6 +249,9 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) struct xen_blkif_ring *ring = &blkif->rings[r]; unsigned int i = 0; + if (!ring->active) + continue; + if (ring->xenblkd) { kthread_stop(ring->xenblkd); wake_up(&ring->shutdown_wq); @@ -296,7 +299,7 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) BUG_ON(ring->free_pages_num != 0); BUG_ON(ring->persistent_gnt_c != 0); WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); - xen_blkif_put(blkif); + ring->active = false; } blkif->nr_ring_pages = 0; /* From 71df1d7ccad1c36f7321d6b3b48f2ea42681c363 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 18 May 2017 17:28:48 +0200 Subject: [PATCH 021/149] xen/blkback: don't free be structure too early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The be structure must not be freed when freeing the blkif structure isn't done. Otherwise a use-after-free of be when unmapping the ring used for communicating with the frontend will occur in case of a late call of xenblk_disconnect() (e.g. due to an I/O still active when trying to disconnect). Signed-off-by: Juergen Gross Tested-by: Steven Haigh Acked-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 998915174bb8..4cdf0490983e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -315,9 +315,10 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) static void xen_blkif_free(struct xen_blkif *blkif) { - - xen_blkif_disconnect(blkif); + WARN_ON(xen_blkif_disconnect(blkif)); xen_vbd_free(&blkif->vbd); + kfree(blkif->be->mode); + kfree(blkif->be); /* Make sure everything is drained before shutting down */ kmem_cache_free(xen_blkif_cachep, blkif); @@ -514,8 +515,6 @@ static int xen_blkbk_remove(struct xenbus_device *dev) xen_blkif_put(be->blkif); } - kfree(be->mode); - kfree(be); return 0; } From a24fa22ce22ae302b3bf8f7008896d52d5d57b8d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 18 May 2017 17:28:49 +0200 Subject: [PATCH 022/149] xen/blkback: don't use xen_blkif_get() in xen-blkback kthread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to use xen_blkif_get()/xen_blkif_put() in the kthread of xen-blkback. Thread stopping is synchronous and using the blkif reference counting in the kthread will avoid to ever let the reference count drop to zero at the end of an I/O running concurrent to disconnecting and multiple rings. Setting ring->xenblkd to NULL after stopping the kthread isn't needed as the kthread does this already. 
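As an illustration (names are made up), why no extra reference is needed: kthread_stop() does not return until the thread function has returned, so the worker can never outlive the object being torn down:

  #include <linux/kthread.h>
  #include <linux/delay.h>

  struct ring {
          struct task_struct *worker;
  };

  static int ring_worker(void *arg)
  {
          while (!kthread_should_stop())
                  msleep(10);     /* stand-in for real I/O processing */
          return 0;               /* reached before kthread_stop() returns */
  }

  static void ring_start(struct ring *r)
  {
          r->worker = kthread_run(ring_worker, r, "ring-worker");
          /* error handling (IS_ERR) omitted in this sketch */
  }

  static void ring_shutdown(struct ring *r)
  {
          if (r->worker)
                  kthread_stop(r->worker);  /* synchronous: worker has exited */
  }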
Signed-off-by: Juergen Gross Tested-by: Steven Haigh Acked-by: Roger Pau Monné Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 3 --- drivers/block/xen-blkback/xenbus.c | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 726c32e35db9..6b14c509f3c7 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -609,8 +609,6 @@ int xen_blkif_schedule(void *arg) unsigned long timeout; int ret; - xen_blkif_get(blkif); - set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -665,7 +663,6 @@ purge_gnt_list: print_stats(ring); ring->xenblkd = NULL; - xen_blkif_put(blkif); return 0; } diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 4cdf0490983e..792da683e70d 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -255,7 +255,6 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) if (ring->xenblkd) { kthread_stop(ring->xenblkd); wake_up(&ring->shutdown_wq); - ring->xenblkd = NULL; } /* The above kthread_stop() guarantees that at this point we From 089bc0143f489bd3a4578bdff5f4ca68fb26f341 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 13 Jun 2017 16:28:27 -0400 Subject: [PATCH 023/149] xen-blkback: don't leak stack data via response ring Rather than constructing a local structure instance on the stack, fill the fields directly on the shared ring, just like other backends do. Build on the fact that all response structure flavors are actually identical (the old code did make this assumption too). This is XSA-216. Cc: stable@vger.kernel.org Signed-off-by: Jan Beulich Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 23 ++++++++++++----------- drivers/block/xen-blkback/common.h | 25 +++++-------------------- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 6b14c509f3c7..0e824091a12f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1433,34 +1433,35 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { - struct blkif_response resp; + struct blkif_response *resp; unsigned long flags; union blkif_back_rings *blk_rings; int notify; - resp.id = id; - resp.operation = op; - resp.status = st; - spin_lock_irqsave(&ring->blk_ring_lock, flags); blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. 
*/ switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: - memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), - &resp, sizeof(resp)); + resp = RING_GET_RESPONSE(&blk_rings->native, + blk_rings->native.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_32: - memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), - &resp, sizeof(resp)); + resp = RING_GET_RESPONSE(&blk_rings->x86_32, + blk_rings->x86_32.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_64: - memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), - &resp, sizeof(resp)); + resp = RING_GET_RESPONSE(&blk_rings->x86_64, + blk_rings->x86_64.rsp_prod_pvt); break; default: BUG(); } + + resp->id = id; + resp->operation = op; + resp->status = st; + blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); spin_unlock_irqrestore(&ring->blk_ring_lock, flags); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 638597b17a38..ecb35fe8ca8d 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -75,9 +75,8 @@ extern unsigned int xenblk_max_queues; struct blkif_common_request { char dummy; }; -struct blkif_common_response { - char dummy; -}; + +/* i386 protocol version */ struct blkif_x86_32_request_rw { uint8_t nr_segments; /* number of segments */ @@ -129,14 +128,6 @@ struct blkif_x86_32_request { } u; } __attribute__((__packed__)); -/* i386 protocol version */ -#pragma pack(push, 4) -struct blkif_x86_32_response { - uint64_t id; /* copied from request */ - uint8_t operation; /* copied from request */ - int16_t status; /* BLKIF_RSP_??? */ -}; -#pragma pack(pop) /* x86_64 protocol version */ struct blkif_x86_64_request_rw { @@ -193,18 +184,12 @@ struct blkif_x86_64_request { } u; } __attribute__((__packed__)); -struct blkif_x86_64_response { - uint64_t __attribute__((__aligned__(8))) id; - uint8_t operation; /* copied from request */ - int16_t status; /* BLKIF_RSP_??? */ -}; - DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, - struct blkif_common_response); + struct blkif_response); DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, - struct blkif_x86_32_response); + struct blkif_response __packed); DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, - struct blkif_x86_64_response); + struct blkif_response); union blkif_back_rings { struct blkif_back_ring native; From e79b0006c45c9b0b22f3ea54ff6e256b34c1f208 Mon Sep 17 00:00:00 2001 From: Megha Dey Date: Wed, 14 Jun 2017 09:51:56 +0530 Subject: [PATCH 024/149] ALSA: hda - Add Coffelake PCI ID Coffelake is another Intel part, so need to add PCI ID for it. Signed-off-by: Megha Dey Signed-off-by: Subhransu S. 
Prusty Acked-by: Vinod Koul Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 1770f085c2a6..e3c696c46a21 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -371,9 +371,10 @@ enum { #define IS_KBL_H(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa2f0) #define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98) #define IS_GLK(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x3198) +#define IS_CFL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa348) #define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci)) || \ IS_KBL(pci) || IS_KBL_LP(pci) || IS_KBL_H(pci) || \ - IS_GLK(pci) + IS_GLK(pci) || IS_CFL(pci) static char *driver_short_names[] = { [AZX_DRIVER_ICH] = "HDA Intel", @@ -2378,6 +2379,9 @@ static const struct pci_device_id azx_ids[] = { /* Kabylake-H */ { PCI_DEVICE(0x8086, 0xa2f0), .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_SKYLAKE }, + /* Coffelake */ + { PCI_DEVICE(0x8086, 0xa348), + .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_SKYLAKE}, /* Broxton-P(Apollolake) */ { PCI_DEVICE(0x8086, 0x5a98), .driver_data = AZX_DRIVER_PCH | AZX_DCAPS_INTEL_BROXTON }, From 1e3d0c2c70cd3edb5deed186c5f5c75f2b84a633 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Jun 2017 13:34:05 +0300 Subject: [PATCH 025/149] xfrm: Oops on error in pfkey_msg2xfrm_state() There are some missing error codes here so we accidentally return NULL instead of an error pointer. It results in a NULL pointer dereference. Fixes: df71837d5024 ("[LSM-IPSec]: Security association restriction.") Signed-off-by: Dan Carpenter Signed-off-by: Steffen Klassert --- net/key/af_key.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 5103f92e2eb0..2b82adac5d8e 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1168,8 +1168,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); - if (!x->aalg) + if (!x->aalg) { + err = -ENOMEM; goto out; + } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { @@ -1188,8 +1190,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, goto out; } x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); - if (!x->calg) + if (!x->calg) { + err = -ENOMEM; goto out; + } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { @@ -1203,8 +1207,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); - if (!x->ealg) + if (!x->ealg) { + err = -ENOMEM; goto out; + } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { @@ -1249,8 +1255,10 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, struct xfrm_encap_tmpl *natt; x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); - if (!x->encap) + if (!x->encap) { + err = -ENOMEM; goto out; + } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; From e747f64336fc15e1c823344942923195b800aa1e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Jun 2017 13:35:37 +0300 Subject: [PATCH 026/149] xfrm: NULL dereference on allocation failure The default error code in pfkey_msg2xfrm_state() is -ENOBUFS. 
We added a new call to security_xfrm_state_alloc() which sets "err" to zero so there several places where we can return ERR_PTR(0) if kmalloc() fails. The caller is expecting error pointers so it leads to a NULL dereference. Fixes: df71837d5024 ("[LSM-IPSec]: Security association restriction.") Signed-off-by: Dan Carpenter Signed-off-by: Steffen Klassert --- net/key/af_key.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/key/af_key.c b/net/key/af_key.c index 2b82adac5d8e..b1432b668033 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1157,6 +1157,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, goto out; } + err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; From 4130b28f568688f79539b732797a1dc04b048442 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Jun 2017 13:55:42 +0200 Subject: [PATCH 027/149] s390/ipl: revert Load Normal semantics for LPAR CCW-type re-IPL This reverts the two commits 7afbeb6df2aa ("s390/ipl: always use load normal for CCW-type re-IPL") 0f7451ff3ab8 ("s390/ipl: use load normal for LPAR re-ipl") The two commits did not take into account that behavior of standby memory changes fundamentally if the re-IPL method is changed from Load Clear to Load Normal. In case of the old re-IPL clear method all memory that was initially in standby state will be put into standby state again within the re-IPL process. Or in other words: memory that was brought online before a re-IPL will be offline again after a reboot. Given that we use different re-IPL methods depending on the hypervisor and CCW-type vs SCSI re-IPL it is not easy to tell in advance when and why memory will stay online or will be offline after a re-IPL. This does also have other side effects, since memory that is online from the beginning will be in ZONE_NORMAL by default vs ZONE_MOVABLE for memory that is offline. Therefore, before the change, a user could online and offline memory easily since standby memory was always in ZONE_NORMAL. After the change, and a re-IPL, this depended on which memory parts were online before the re-IPL. From a usability point of view the current behavior is more than suboptimal. Therefore revert these changes until we have a better solution and get back to a consistent behavior. The bad thing about this is that the time required for a re-IPL will be significantly increased for configurations with several 100GB or 1TB of memory. 
Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ipl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index e545ffe5155a..8e622bb52f7a 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -564,8 +564,6 @@ static struct kset *ipl_kset; static void __ipl_run(void *unused) { - if (MACHINE_IS_LPAR && ipl_info.type == IPL_TYPE_CCW) - diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); diag308(DIAG308_LOAD_CLEAR, NULL); if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); @@ -1088,10 +1086,7 @@ static void __reipl_run(void *unused) break; case REIPL_METHOD_CCW_DIAG: diag308(DIAG308_SET, reipl_block_ccw); - if (MACHINE_IS_LPAR) - diag308(DIAG308_LOAD_NORMAL_DUMP, NULL); - else - diag308(DIAG308_LOAD_CLEAR, NULL); + diag308(DIAG308_LOAD_CLEAR, NULL); break; case REIPL_METHOD_FCP_RW_DIAG: diag308(DIAG308_SET, reipl_block_fcp); From 2deaeaf102d692cb6f764123b1df7aa118a8e97c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Jun 2017 16:20:32 +0200 Subject: [PATCH 028/149] ALSA: pcm: Don't treat NULL chmap as a fatal error The standard PCM chmap helper callbacks treat the NULL info->chmap as a fatal error and spews the kernel warning with stack trace when CONFIG_SND_DEBUG is on. This was OK, originally it was supposed to be always static and non-NULL. But, as the recent addition of Intel LPE audio driver shows, the chmap content may vary dynamically, and it can be even NULL when disconnected. The user still sees the kernel warning unnecessarily. For clearing such a confusion, this patch simply removes the snd_BUG_ON() in each place, just returns an error without warning. Cc: # v4.11+ Signed-off-by: Takashi Iwai --- sound/core/pcm_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 5088d4b8db22..009e6c98754e 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -2492,7 +2492,7 @@ static int pcm_chmap_ctl_get(struct snd_kcontrol *kcontrol, struct snd_pcm_substream *substream; const struct snd_pcm_chmap_elem *map; - if (snd_BUG_ON(!info->chmap)) + if (!info->chmap) return -EINVAL; substream = snd_pcm_chmap_substream(info, idx); if (!substream) @@ -2524,7 +2524,7 @@ static int pcm_chmap_ctl_tlv(struct snd_kcontrol *kcontrol, int op_flag, unsigned int __user *dst; int c, count = 0; - if (snd_BUG_ON(!info->chmap)) + if (!info->chmap) return -EINVAL; if (size < 8) return -ENOMEM; From 4c3bb4ccd074e1a0552078c0bf94c662367a1658 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Jun 2017 15:43:17 +1000 Subject: [PATCH 029/149] KVM: PPC: Book3S HV: Restore critical SPRs to host values on guest exit This restores several special-purpose registers (SPRs) to sane values on guest exit that were missed before. TAR and VRSAVE are readable and writable by userspace, and we need to save and restore them to prevent the guest from potentially affecting userspace execution (not that TAR or VRSAVE are used by any known program that run uses the KVM_RUN ioctl). We save/restore these in kvmppc_vcpu_run_hv() rather than on every guest entry/exit. FSCR affects userspace execution in that it can prohibit access to certain facilities by userspace. We restore it to the normal value for the task on exit from the KVM_RUN ioctl. IAMR is normally 0, and is restored to 0 on guest exit. However, with a radix host on POWER9, it is set to a value that prevents the kernel from executing user-accessible memory. 
On POWER9, we save IAMR on guest entry and restore it on guest exit to the saved value rather than 0. On POWER8 we continue to set it to 0 on guest exit. PSPB is normally 0. We restore it to 0 on guest exit to prevent userspace taking advantage of the guest having set it non-zero (which would allow userspace to set its SMT priority to high). UAMOR is normally 0. We restore it to 0 on guest exit to prevent the AMR from being used as a covert channel between userspace processes, since the AMR is not context-switched at present. Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs", 2014-01-08) Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 11 +++++++++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 400a5992b121..a963762a031f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2908,6 +2908,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) int r; int srcu_idx; unsigned long ebb_regs[3] = {}; /* shut up GCC */ + unsigned long user_tar = 0; + unsigned int user_vrsave; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -2935,12 +2937,14 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_all_to_thread(current); - /* Save userspace EBB register values */ + /* Save userspace EBB and other register values */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) { ebb_regs[0] = mfspr(SPRN_EBBHR); ebb_regs[1] = mfspr(SPRN_EBBRR); ebb_regs[2] = mfspr(SPRN_BESCR); + user_tar = mfspr(SPRN_TAR); } + user_vrsave = mfspr(SPRN_VRSAVE); vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; @@ -2968,12 +2972,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } } while (is_kvmppc_resume_guest(r)); - /* Restore userspace EBB register values */ + /* Restore userspace EBB and other register values */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) { mtspr(SPRN_EBBHR, ebb_regs[0]); mtspr(SPRN_EBBRR, ebb_regs[1]); mtspr(SPRN_BESCR, ebb_regs[2]); + mtspr(SPRN_TAR, user_tar); + mtspr(SPRN_FSCR, current->thread.fscr); } + mtspr(SPRN_VRSAVE, user_vrsave); out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e390b383b4d6..4e4390564276 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -558,6 +558,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) #define STACK_SLOT_TID (112-16) #define STACK_SLOT_PSSCR (112-24) #define STACK_SLOT_PID (112-32) +#define STACK_SLOT_IAMR (112-40) .global kvmppc_hv_entry kvmppc_hv_entry: @@ -758,9 +759,11 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) + std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION @@ -1515,11 +1518,12 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) * set by the guest could disrupt the host. 
*/ li r0, 0 - mtspr SPRN_IAMR, r0 mtspr SPRN_CIABR, r0 mtspr SPRN_DAWRX, r0 + mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION + mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 @@ -1535,6 +1539,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) std r6,VCPU_UAMOR(r9) li r6,0 mtspr SPRN_AMR,r6 + mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ mfspr r8, SPRN_DSCR @@ -1683,9 +1688,11 @@ BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) + ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 + mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT From 46a704f8409f79fd66567ad3f8a7304830a84293 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 15 Jun 2017 16:10:27 +1000 Subject: [PATCH 030/149] KVM: PPC: Book3S HV: Preserve userspace HTM state properly If userspace attempts to call the KVM_RUN ioctl when it has hardware transactional memory (HTM) enabled, the values that it has put in the HTM-related SPRs TFHAR, TFIAR and TEXASR will get overwritten by guest values. To fix this, we detect this condition and save those SPR values in the thread struct, and disable HTM for the task. If userspace goes to access those SPRs or the HTM facility in future, a TM-unavailable interrupt will occur and the handler will reload those SPRs and re-enable HTM. If userspace has started a transaction and suspended it, we would currently lose the transactional state in the guest entry path and would almost certainly get a "TM Bad Thing" interrupt, which would cause the host to crash. To avoid this, we detect this case and return from the KVM_RUN ioctl with an EINVAL error, with the KVM exit reason set to KVM_EXIT_FAIL_ENTRY. Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs", 2014-01-08) Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a963762a031f..fd4d978d5257 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2916,6 +2916,27 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + /* + * Don't allow entry with a suspended transaction, because + * the guest entry/exit code will lose it. + * If the guest has TM enabled, save away their TM-related SPRs + * (they will get restored by the TM unavailable interrupt). + */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + (current->thread.regs->msr & MSR_TM)) { + if (MSR_TM_ACTIVE(current->thread.regs->msr)) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = 0; + return -EINVAL; + } + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); + current->thread.tm_texasr = mfspr(SPRN_TEXASR); + current->thread.regs->msr &= ~MSR_TM; + } +#endif + kvmppc_core_prepare_to_enter(vcpu); /* No need to go into the guest when all we'll do is come back out */ From cd15fb64ee56192760ad5c1e2ad97a65e735b18b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 15 Jun 2017 08:39:15 -0400 Subject: [PATCH 031/149] Revert "dm mirror: use all available legs on multiple failures" This reverts commit 12a7cf5ba6c776a2621d8972c7d42e8d3d959d20. 
This commit apparently attempted to fix an issue that didn't really exist, furthermore: this commit is the source of deadlocks and crashes seen in multiple cases related to failing the primary mirror dev while syncing. Reported-by: Jonathan Brassow Signed-off-by: Mike Snitzer --- drivers/md/dm-raid1.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index e61c45047c25..4da8858856fb 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,6 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; + /* if details->bi_bdev == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -1198,6 +1199,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); + bio_record->details.bi_bdev = NULL; + if (rw == WRITE) { /* Save region for mirror_end_io() handler */ bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio); @@ -1256,12 +1259,22 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) } if (error == -EOPNOTSUPP) - return error; + goto out; if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD)) - return error; + goto out; if (unlikely(error)) { + if (!bio_record->details.bi_bdev) { + /* + * There wasn't enough memory to record necessary + * information for a retry or there was no other + * mirror in-sync. + */ + DMERR_LIMIT("Mirror read failed."); + return -EIO; + } + m = bio_record->m; DMERR("Mirror read failed from %s. Trying alternative device.", @@ -1277,6 +1290,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) bd = &bio_record->details; dm_bio_restore(bd, bio); + bio_record->details.bi_bdev = NULL; bio->bi_error = 0; queue_bio(ms, bio, rw); @@ -1285,6 +1299,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) DMERR("All replicated volumes dead, failing I/O"); } +out: + bio_record->details.bi_bdev = NULL; + return error; } From 91ebcd1b97aea87172e8ae8871f1f328b9fa9c05 Mon Sep 17 00:00:00 2001 From: Aurelien Jacquiot Date: Thu, 15 Jun 2017 11:41:17 +0200 Subject: [PATCH 032/149] MAINTAINERS: update email address for C6x maintainer Aurelien has moved. Signed-off-by: Aurelien Jacquiot Signed-off-by: Mark Salter --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38d3e4ed7208..5714bc7f6fe7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2945,7 +2945,7 @@ F: sound/pci/oxygen/ C6X ARCHITECTURE M: Mark Salter -M: Aurelien Jacquiot +M: Aurelien Jacquiot L: linux-c6x-dev@linux-c6x.org W: http://www.linux-c6x.org/wiki/index.php/Main_Page S: Maintained From 7ceaa6dcd8c6f59588428cec37f3c8093dd1011f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 16 Jun 2017 11:53:19 +1000 Subject: [PATCH 033/149] KVM: PPC: Book3S HV: Save/restore host values of debug registers At present, HV KVM on POWER8 and POWER9 machines loses any instruction or data breakpoint set in the host whenever a guest is run. Instruction breakpoints are currently only used by xmon, but ptrace and the perf_event subsystem can set data breakpoints as well as xmon. To fix this, we save the host values of the debug registers (CIABR, DAWR and DAWRX) before entering the guest and restore them on exit. 
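To make the impact concrete, here is a hedged userspace sketch of a host-side data watchpoint created through perf_event_open(); the helper name and the watched address are illustrative. On POWER8/POWER9 such a watchpoint is programmed into DAWR/DAWRX, which is why losing those registers across a guest run silently breaks host ptrace and perf breakpoints.

/* Sketch: host data watchpoint via perf_event_open(), error handling trimmed. */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long watch_writes_to(void *addr)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.bp_type = HW_BREAKPOINT_W;		/* fire on writes */
	attr.bp_addr = (unsigned long)addr;
	attr.bp_len = HW_BREAKPOINT_LEN_8;

	/* current thread, any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}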
To provide space to save them in the stack frame, we expand the stack frame allocated by kvmppc_hv_entry() from 112 to 144 bytes. Fixes: b005255e12a3 ("KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs", 2014-01-08) Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 45 ++++++++++++++++++------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 4e4390564276..4888dd494604 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -44,6 +44,17 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +/* Stack frame offsets for kvmppc_hv_entry */ +#define SFS 144 +#define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_TID (SFS-16) +#define STACK_SLOT_PSSCR (SFS-24) +#define STACK_SLOT_PID (SFS-32) +#define STACK_SLOT_IAMR (SFS-40) +#define STACK_SLOT_CIABR (SFS-48) +#define STACK_SLOT_DAWR (SFS-56) +#define STACK_SLOT_DAWRX (SFS-64) + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -328,10 +339,10 @@ kvm_novcpu_exit: bl kvmhv_accumulate_time #endif 13: mr r3, r12 - stw r12, 112-4(r1) + stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_commence_exit nop - lwz r12, 112-4(r1) + lwz r12, STACK_SLOT_TRAP(r1) b kvmhv_switch_to_host /* @@ -554,12 +565,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * * *****************************************************************************/ -/* Stack frame offsets */ -#define STACK_SLOT_TID (112-16) -#define STACK_SLOT_PSSCR (112-24) -#define STACK_SLOT_PID (112-32) -#define STACK_SLOT_IAMR (112-40) - .global kvmppc_hv_entry kvmppc_hv_entry: @@ -575,7 +580,7 @@ kvmppc_hv_entry: */ mflr r0 std r0, PPC_LR_STKOFF(r1) - stdu r1, -112(r1) + stdu r1, -SFS(r1) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) @@ -765,6 +770,14 @@ BEGIN_FTR_SECTION std r7, STACK_SLOT_PID(r1) std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + mfspr r5, SPRN_CIABR + mfspr r6, SPRN_DAWR + mfspr r7, SPRN_DAWRX + std r5, STACK_SLOT_CIABR(r1) + std r6, STACK_SLOT_DAWR(r1) + std r7, STACK_SLOT_DAWRX(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION /* Set partition DABR */ @@ -1518,8 +1531,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) * set by the guest could disrupt the host. 
*/ li r0, 0 - mtspr SPRN_CIABR, r0 - mtspr SPRN_DAWRX, r0 mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION @@ -1684,6 +1695,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ptesync /* Restore host values of some registers */ +BEGIN_FTR_SECTION + ld r5, STACK_SLOT_CIABR(r1) + ld r6, STACK_SLOT_DAWR(r1) + ld r7, STACK_SLOT_DAWRX(r1) + mtspr SPRN_CIABR, r5 + mtspr SPRN_DAWR, r6 + mtspr SPRN_DAWRX, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) @@ -1836,8 +1855,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) - ld r0, 112+PPC_LR_STKOFF(r1) - addi r1, r1, 112 + ld r0, SFS+PPC_LR_STKOFF(r1) + addi r1, r1, SFS mtlr r0 blr From 3d3efb68c19e539f0535c93a5258c1299270215f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Jun 2017 14:35:30 +1000 Subject: [PATCH 034/149] KVM: PPC: Book3S HV: Ignore timebase offset on POWER9 DD1 POWER9 DD1 has an erratum where writing to the TBU40 register, which is used to apply an offset to the timebase, can cause the timebase to lose counts. This results in the timebase on some CPUs getting out of sync with other CPUs, which then results in misbehaviour of the timekeeping code. To work around the problem, we make KVM ignore the timebase offset for all guests on POWER9 DD1 machines. This means that live migration cannot be supported on POWER9 DD1 machines. Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fd4d978d5257..8d1a365b8edc 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1486,6 +1486,14 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; case KVM_REG_PPC_TB_OFFSET: + /* + * POWER9 DD1 has an erratum where writing TBU40 causes + * the timebase to lose ticks. So we don't let the + * timebase offset be changed on P9 DD1. (It is + * initialized to zero.) + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + break; /* round up to multiple of 2^24 */ vcpu->arch.vcore->tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24); From a9f8553e935f26cb5447f67e280946b0923cd2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:15 +0530 Subject: [PATCH 035/149] powerpc/kprobes: Pause function_graph tracing during jprobes handling This fixes a crash when function_graph and jprobes are used together. This is essentially commit 237d28db036e ("ftrace/jprobes/x86: Fix conflict between jprobes and function graph tracing"), but for powerpc. Jprobes breaks function_graph tracing since the jprobe hook needs to use jprobe_return(), which never returns back to the hook, but instead to the original jprobe'd function. The solution is to momentarily pause function_graph tracing before invoking the jprobe hook and re-enable it when returning back to the original jprobe'd function. Fixes: 6794c78243bf ("powerpc64: port of the function graph tracer") Cc: stable@vger.kernel.org # v2.6.30+ Signed-off-by: Naveen N. 
Rao Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index fc4343514bed..5075a4d6f1d7 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -617,6 +617,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); #endif + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); + return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -642,6 +651,8 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) * saved regs... */ memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); preempt_enable_no_resched(); return 1; } From a4979a7e71eb8da976cbe4a0a1fa50636e76b04f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:16 +0530 Subject: [PATCH 036/149] powerpc/ftrace: Pass the correct stack pointer for DYNAMIC_FTRACE_WITH_REGS For DYNAMIC_FTRACE_WITH_REGS, we should be passing-in the original set of registers in pt_regs, to capture the state _before_ ftrace_caller. However, we are instead passing the stack pointer *after* allocating a stack frame in ftrace_caller. Fix this by saving the proper value of r1 in pt_regs. Also, use SAVE_10GPRS() to simplify the code. Fixes: 153086644fd1 ("powerpc/ftrace: Add support for -mprofile-kernel ftrace ABI") Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- .../powerpc/kernel/trace/ftrace_64_mprofile.S | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 7c933a99f5d5..fa0921410fa4 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -45,10 +45,14 @@ _GLOBAL(ftrace_caller) stdu r1,-SWITCH_FRAME_SIZE(r1) /* Save all gprs to pt_regs */ - SAVE_8GPRS(0,r1) - SAVE_8GPRS(8,r1) - SAVE_8GPRS(16,r1) - SAVE_8GPRS(24,r1) + SAVE_GPR(0, r1) + SAVE_10GPRS(2, r1) + SAVE_10GPRS(12, r1) + SAVE_10GPRS(22, r1) + + /* Save previous stack pointer (r1) */ + addi r8, r1, SWITCH_FRAME_SIZE + std r8, GPR1(r1) /* Load special regs for save below */ mfmsr r8 @@ -103,10 +107,10 @@ ftrace_call: #endif /* Restore gprs */ - REST_8GPRS(0,r1) - REST_8GPRS(8,r1) - REST_8GPRS(16,r1) - REST_8GPRS(24,r1) + REST_GPR(0,r1) + REST_10GPRS(2,r1) + REST_10GPRS(12,r1) + REST_10GPRS(22,r1) /* Restore possibly modified LR */ ld r0, _LINK(r1) From c05b8c4474c03026aaa7f8872e78369f69f1bb08 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 1 Jun 2017 16:18:17 +0530 Subject: [PATCH 037/149] powerpc/kprobes: Skip livepatch_handler() for jprobes ftrace_caller() depends on a modified regs->nip to detect if a certain function has been livepatched. However, with KPROBES_ON_FTRACE, it is possible for regs->nip to have been modified by the kprobes pre_handler (jprobes, for instance). In this case, we do not want to invoke the livepatch_handler so as not to consume the livepatch stack. 
To distinguish between the two (kprobes and livepatch), we check if there is an active kprobe on the current function. If there is, then we know for sure that it must have modified the NIP as we don't support livepatching a kprobe'd function. In this case, we simply skip the livepatch_handler and branch to the new NIP. Otherwise, the livepatch_handler is invoked. Fixes: ead514d5fb30 ("powerpc/kprobes: Add support for KPROBES_ON_FTRACE") Signed-off-by: Naveen N. Rao Reviewed-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kprobes.h | 1 + arch/powerpc/kernel/kprobes.c | 6 +++ .../powerpc/kernel/trace/ftrace_64_mprofile.S | 39 ++++++++++++++++--- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index a83821f33ea3..8814a7249ceb 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -103,6 +103,7 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); +extern int is_current_kprobe_addr(unsigned long addr); #ifdef CONFIG_KPROBES_ON_FTRACE extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 5075a4d6f1d7..01addfb0ed0a 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,6 +43,12 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; +int is_current_kprobe_addr(unsigned long addr) +{ + struct kprobe *p = kprobe_running(); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; +} + bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index fa0921410fa4..c98e90b4ea7b 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -99,13 +99,39 @@ ftrace_call: bl ftrace_stub nop - /* Load ctr with the possibly modified NIP */ - ld r3, _NIP(r1) - mtctr r3 + /* Load the possibly modified NIP */ + ld r15, _NIP(r1) + #ifdef CONFIG_LIVEPATCH - cmpd r14,r3 /* has NIP been altered? */ + cmpd r14, r15 /* has NIP been altered? */ #endif +#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_KPROBES_ON_FTRACE) + /* NIP has not been altered, skip over further checks */ + beq 1f + + /* Check if there is an active kprobe on us */ + subi r3, r14, 4 + bl is_current_kprobe_addr + nop + + /* + * If r3 == 1, then this is a kprobe/jprobe. + * else, this is livepatched function. + * + * The conditional branch for livepatch_handler below will use the + * result of this comparison. For kprobe/jprobe, we just need to branch to + * the new NIP, not call livepatch_handler. The branch below is bne, so we + * want CR0[EQ] to be true if this is a kprobe/jprobe. Which means we want + * CR0[EQ] = (r3 == 1). 
+ */ + cmpdi r3, 1 +1: +#endif + + /* Load CTR with the possibly modified NIP */ + mtctr r15 + /* Restore gprs */ REST_GPR(0,r1) REST_10GPRS(2,r1) @@ -123,7 +149,10 @@ ftrace_call: addi r1, r1, SWITCH_FRAME_SIZE #ifdef CONFIG_LIVEPATCH - /* Based on the cmpd above, if the NIP was altered handle livepatch */ + /* + * Based on the cmpd or cmpdi above, if the NIP was altered and we're + * not on a kprobe/jprobe, then handle livepatch. + */ bne- livepatch_handler #endif From d89ba5353f301971dd7d2f9fdf25c4432728f38e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 14 Jun 2017 00:12:00 +0530 Subject: [PATCH 038/149] powerpc/64s: Handle data breakpoints in Radix mode On Power9, trying to use data breakpoints throws the splat shown below. This is because the check for a data breakpoint in DSISR is in do_hash_page(), which is not called when in Radix mode. Unable to handle kernel paging request for data at address 0xc000000000e19218 Faulting instruction address: 0xc0000000001155e8 cpu 0x0: Vector: 300 (Data Access) at [c0000000ef1e7b20] pc: c0000000001155e8: find_pid_ns+0x48/0xe0 lr: c000000000116ac4: find_task_by_vpid+0x44/0x90 sp: c0000000ef1e7da0 msr: 9000000000009033 dar: c000000000e19218 dsisr: 400000 Move the check to handle_page_fault() so as to catch data breakpoints in both Hash and Radix MMU modes. We have to change the check in do_hash_page() against 0xa410 to use 0xa450, so as to include the value of (DSISR_DABRMATCH << 16). There are two sites that call handle_page_fault() when in Radix, both already pass DSISR in r4. Fixes: caca285e5ab4 ("powerpc/mm/radix: Use STD_MMU_64 to properly isolate hash related code") Cc: stable@vger.kernel.org # v4.7+ Reported-by: Shriya R. Kulkarni Signed-off-by: Naveen N. Rao [mpe: Fix the fall-through case on hash, we need to reload DSISR] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ae418b85c17c..b886795060fd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1411,10 +1411,8 @@ USE_TEXT_SECTION() .balign IFETCH_ALIGN_BYTES do_hash_page: #ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa410 /* weird error? */ + andis. r0,r4,0xa450 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - andis. r0,r4,DSISR_DABRMATCH@h - bne- handle_dabr_fault CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ @@ -1438,11 +1436,16 @@ do_hash_page: /* Error */ blt- 13f + + /* Reload DSISR into r4 for the DABR check below */ + ld r4,_DSISR(r1) #endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: ld r4,_DAR(r1) +11: andis. r0,r4,DSISR_DABRMATCH@h + bne- handle_dabr_fault + ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault From bf05fc25f268cd62f147f368fe65ad3e5b04fe9f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 15 Jun 2017 19:16:48 +0530 Subject: [PATCH 039/149] powerpc/perf: Fix oops when kthread execs user process When a kthread calls call_usermodehelper() the steps are: 1. allocate current->mm 2. load_elf_binary() 3. populate current->thread.regs While doing this, interrupts are not disabled. If there is a perf interrupt in the middle of this process (i.e. 
step 1 has completed but not yet reached to step 3) and if perf tries to read userspace regs, kernel oops with following log: Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc0000000000da0fc ... Call Trace: perf_output_sample_regs+0x6c/0xd0 perf_output_sample+0x4e4/0x830 perf_event_output_forward+0x64/0x90 __perf_event_overflow+0x8c/0x1e0 record_and_restart+0x220/0x5c0 perf_event_interrupt+0x2d8/0x4d0 performance_monitor_exception+0x54/0x70 performance_monitor_common+0x158/0x160 --- interrupt: f01 at avtab_search_node+0x150/0x1a0 LR = avtab_search_node+0x100/0x1a0 ... load_elf_binary+0x6e8/0x15a0 search_binary_handler+0xe8/0x290 do_execveat_common.isra.14+0x5f4/0x840 call_usermodehelper_exec_async+0x170/0x210 ret_from_kernel_thread+0x5c/0x7c Fix it by setting abi to PERF_SAMPLE_REGS_ABI_NONE when userspace pt_regs are not set. Fixes: ed4a4ef85cf5 ("powerpc/perf: Add support for sampling interrupt register state") Cc: stable@vger.kernel.org # v4.7+ Signed-off-by: Ravi Bangoria Acked-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/perf/perf_regs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index cbd82fde5770..09ceea6175ba 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -101,5 +101,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, struct pt_regs *regs_user_copy) { regs_user->regs = task_pt_regs(current); - regs_user->abi = perf_reg_abi(current); + regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) : + PERF_SAMPLE_REGS_ABI_NONE; } From 57db7e4a2d92c2d3dfbca4ef8057849b2682436b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 13 Jun 2017 04:31:16 -0500 Subject: [PATCH 040/149] signal: Only reschedule timers on signals timers have sent Thomas Gleixner wrote: > The CRIU support added a 'feature' which allows a user space task to send > arbitrary (kernel) signals to itself. The changelog says: > > The kernel prevents sending of siginfo with positive si_code, because > these codes are reserved for kernel. I think we can allow a task to > send such a siginfo to itself. This operation should not be dangerous. > > Quite contrary to that claim, it turns out that it is outright dangerous > for signals with info->si_code == SI_TIMER. The following code sequence in > a user space task allows to crash the kernel: > > id = timer_create(CLOCK_XXX, ..... signo = SIGX); > timer_set(id, ....); > info->si_signo = SIGX; > info->si_code = SI_TIMER: > info->_sifields._timer._tid = id; > info->_sifields._timer._sys_private = 2; > rt_[tg]sigqueueinfo(..., SIGX, info); > sigemptyset(&sigset); > sigaddset(&sigset, SIGX); > rt_sigtimedwait(sigset, info); > > For timers based on CLOCK_PROCESS_CPUTIME_ID, CLOCK_THREAD_CPUTIME_ID this > results in a kernel crash because sigwait() dequeues the signal and the > dequeue code observes: > > info->si_code == SI_TIMER && info->_sifields._timer._sys_private != 0 > > which triggers the following callchain: > > do_schedule_next_timer() -> posix_cpu_timer_schedule() -> arm_timer() > > arm_timer() executes a list_add() on the timer, which is already armed via > the timer_set() syscall. That's a double list add which corrupts the posix > cpu timer list. As a consequence the kernel crashes on the next operation > touching the posix cpu timer list. 
> > Posix clocks which are internally implemented based on hrtimers are not > affected by this because hrtimer_start() can handle already armed timers > nicely, but it's a reliable way to trigger the WARN_ON() in > hrtimer_forward(), which complains about calling that function on an > already armed timer. This problem has existed since the posix timer code was merged into 2.5.63. A few releases earlier in 2.5.60 ptrace gained the ability to inject not just a signal (which linux has supported since 1.0) but the full siginfo of a signal. The core problem is that the code will reschedule in response to signals getting dequeued not just for signals the timers sent but for other signals that happen to a si_code of SI_TIMER. Avoid this confusion by testing to see if the queued signal was preallocated as all timer signals are preallocated, and so far only the timer code preallocates signals. Move the check for if a timer needs to be rescheduled up into collect_signal where the preallocation check must be performed, and pass the result back to dequeue_signal where the code reschedules timers. This makes it clear why the code cares about preallocated timers. Cc: stable@vger.kernel.org Reported-by: Thomas Gleixner History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Reference: 66dd34ad31e5 ("signal: allow to send any siginfo to itself") Reference: 1669ce53e2ff ("Add PTRACE_GETSIGINFO and PTRACE_SETSIGINFO") Fixes: db8b50ba75f2 ("[PATCH] POSIX clocks & timers") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..45b4c1ffe14e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -510,7 +510,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, + bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -532,6 +533,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); + + *resched_timer = + (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER) && + (info->si_sys_private); + __sigqueue_free(first); } else { /* @@ -548,12 +555,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) + siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info); + collect_signal(sig, pending, info, resched_timer); return sig; } @@ -565,15 +572,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { + bool resched_timer = false; int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); + mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? 
@@ -621,7 +629,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) current->jobctl |= JOBCTL_STOP_DEQUEUED; } #ifdef CONFIG_POSIX_TIMERS - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + if (resched_timer) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave From 779f19ac9d5858a2c159030c0c166f7da46b74ae Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 18 Jun 2017 15:10:26 -0700 Subject: [PATCH 041/149] Input: soc_button_array - fix leaking the ACPI button descriptor buffer We are passing a buffer with ACPI_ALLOCATE_BUFFER set to acpi_evaluate_object, so we must free it when we are done with it. Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- drivers/input/misc/soc_button_array.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/input/misc/soc_button_array.c b/drivers/input/misc/soc_button_array.c index e37d37273182..f600f3a7a3c6 100644 --- a/drivers/input/misc/soc_button_array.c +++ b/drivers/input/misc/soc_button_array.c @@ -248,7 +248,8 @@ static struct soc_button_info *soc_button_get_button_info(struct device *dev) if (!btns_desc) { dev_err(dev, "ACPI Button Descriptors not found\n"); - return ERR_PTR(-ENODEV); + button_info = ERR_PTR(-ENODEV); + goto out; } /* The first package describes the collection */ @@ -264,24 +265,31 @@ static struct soc_button_info *soc_button_get_button_info(struct device *dev) } if (collection_uid == -1) { dev_err(dev, "Invalid Button Collection Descriptor\n"); - return ERR_PTR(-ENODEV); + button_info = ERR_PTR(-ENODEV); + goto out; } /* There are package.count - 1 buttons + 1 terminating empty entry */ button_info = devm_kcalloc(dev, btns_desc->package.count, sizeof(*button_info), GFP_KERNEL); - if (!button_info) - return ERR_PTR(-ENOMEM); + if (!button_info) { + button_info = ERR_PTR(-ENOMEM); + goto out; + } /* Parse the button descriptors */ for (i = 1, btn = 0; i < btns_desc->package.count; i++, btn++) { if (soc_button_parse_btn_desc(dev, &btns_desc->package.elements[i], collection_uid, - &button_info[btn])) - return ERR_PTR(-ENODEV); + &button_info[btn])) { + button_info = ERR_PTR(-ENODEV); + goto out; + } } +out: + kfree(buf.pointer); return button_info; } From a21ef715fbb8210c50b1d684145f8acdf2339596 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Jun 2017 14:11:29 +0100 Subject: [PATCH 042/149] drm/i915: Differentiate between sw write location into ring and last hw read We need to keep track of the last location we ask the hw to read up to (RING_TAIL) separately from our last write location into the ring, so that in the event of a GPU reset we do not tell the HW to proceed into a partially written request (which can happen if that request is waiting for an external signal before being executed). 
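The split between the two positions can be summarised with a small conceptual sketch (illustrative only; the real fields live in struct intel_ring in the diff below): "emit" is where software writes next, "tail" is the last position actually handed to the hardware, and a reset rewinds to tail, never to emit, so a half-written request is not executed.

struct ring_sketch {
	u32 head;	/* hardware read position */
	u32 tail;	/* last RING_TAIL value submitted to the HW */
	u32 emit;	/* software write cursor, may run ahead of tail */
};

static u32 ring_sketch_set_tail(struct ring_sketch *ring, u32 tail)
{
	/* advanced only when a request is really submitted to the HW */
	ring->tail = tail;
	return tail;
}

static void ring_sketch_emit(struct ring_sketch *ring, u32 bytes)
{
	ring->emit += bytes;	/* command writes land here first */
}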
v2: Refactor intel_ring_reset() (Mika) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100144 Testcase: igt/gem_exec_fence/await-hang Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests") Fixes: d55ac5bf97c6 ("drm/i915: Defer transfer onto execution timeline to actual hw submission") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Mika Kuoppala Link: http://patchwork.freedesktop.org/patch/msgid/20170425130049.26147-1-chris@chris-wilson.co.uk Reviewed-by: Mika Kuoppala (cherry picked from commit e6ba9992de6c63fe86c028b4876338e1cb7dac34) Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/20170615131129.3061-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/gpu/drm/i915/i915_guc_submission.c | 4 +-- drivers/gpu/drm/i915/intel_lrc.c | 6 ++-- drivers/gpu/drm/i915/intel_ringbuffer.c | 41 ++++++++++++++-------- drivers/gpu/drm/i915/intel_ringbuffer.h | 19 ++++++++-- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 5ddbc9499775..a74d0ac737cb 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -623,7 +623,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine, * GPU processing the request, we never over-estimate the * position of the head. */ - req->head = req->ring->tail; + req->head = req->ring->emit; /* Check that we didn't interrupt ourselves with a new request */ GEM_BUG_ON(req->timeline->seqno != req->fence.seqno); diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 1642fff9cf13..ab5140ba108d 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -480,9 +480,7 @@ static void guc_wq_item_append(struct i915_guc_client *client, GEM_BUG_ON(freespace < wqi_size); /* The GuC firmware wants the tail index in QWords, not bytes */ - tail = rq->tail; - assert_ring_tail_valid(rq->ring, rq->tail); - tail >>= 3; + tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3; GEM_BUG_ON(tail > WQ_RING_TAIL_MAX); /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index dac4e003c1f3..62f44d3e7c43 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -326,8 +326,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt; u32 *reg_state = ce->lrc_reg_state; - assert_ring_tail_valid(rq->ring, rq->tail); - reg_state[CTX_RING_TAIL+1] = rq->tail; + reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail); /* True 32b PPGTT with dynamic page allocation: update PDP * registers and point the unallocated PDPs to scratch page. 
@@ -2036,8 +2035,7 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv) ce->state->obj->mm.dirty = true; i915_gem_object_unpin_map(ce->state->obj); - ce->ring->head = ce->ring->tail = 0; - intel_ring_update_space(ce->ring); + intel_ring_reset(ce->ring, 0); } } } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 66a2b8b83972..513a0f4b469b 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -49,7 +49,7 @@ static int __intel_ring_space(int head, int tail, int size) void intel_ring_update_space(struct intel_ring *ring) { - ring->space = __intel_ring_space(ring->head, ring->tail, ring->size); + ring->space = __intel_ring_space(ring->head, ring->emit, ring->size); } static int @@ -774,8 +774,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request) i915_gem_request_submit(request); - assert_ring_tail_valid(request->ring, request->tail); - I915_WRITE_TAIL(request->engine, request->tail); + I915_WRITE_TAIL(request->engine, + intel_ring_set_tail(request->ring, request->tail)); } static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req, u32 *cs) @@ -1316,11 +1316,23 @@ err: return PTR_ERR(addr); } +void intel_ring_reset(struct intel_ring *ring, u32 tail) +{ + GEM_BUG_ON(!list_empty(&ring->request_list)); + ring->tail = tail; + ring->head = tail; + ring->emit = tail; + intel_ring_update_space(ring); +} + void intel_ring_unpin(struct intel_ring *ring) { GEM_BUG_ON(!ring->vma); GEM_BUG_ON(!ring->vaddr); + /* Discard any unused bytes beyond that submitted to hw. */ + intel_ring_reset(ring, ring->tail); + if (i915_vma_is_map_and_fenceable(ring->vma)) i915_vma_unpin_iomap(ring->vma); else @@ -1562,8 +1574,9 @@ void intel_legacy_submission_resume(struct drm_i915_private *dev_priv) struct intel_engine_cs *engine; enum intel_engine_id id; + /* Restart from the beginning of the rings for convenience */ for_each_engine(engine, dev_priv, id) - engine->buffer->head = engine->buffer->tail; + intel_ring_reset(engine->buffer, 0); } static int ring_request_alloc(struct drm_i915_gem_request *request) @@ -1616,7 +1629,7 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes) unsigned space; /* Would completion of this request free enough space? 
*/ - space = __intel_ring_space(target->postfix, ring->tail, + space = __intel_ring_space(target->postfix, ring->emit, ring->size); if (space >= bytes) break; @@ -1641,8 +1654,8 @@ static int wait_for_space(struct drm_i915_gem_request *req, int bytes) u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords) { struct intel_ring *ring = req->ring; - int remain_actual = ring->size - ring->tail; - int remain_usable = ring->effective_size - ring->tail; + int remain_actual = ring->size - ring->emit; + int remain_usable = ring->effective_size - ring->emit; int bytes = num_dwords * sizeof(u32); int total_bytes, wait_bytes; bool need_wrap = false; @@ -1678,17 +1691,17 @@ u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords) if (unlikely(need_wrap)) { GEM_BUG_ON(remain_actual > ring->space); - GEM_BUG_ON(ring->tail + remain_actual > ring->size); + GEM_BUG_ON(ring->emit + remain_actual > ring->size); /* Fill the tail with MI_NOOP */ - memset(ring->vaddr + ring->tail, 0, remain_actual); - ring->tail = 0; + memset(ring->vaddr + ring->emit, 0, remain_actual); + ring->emit = 0; ring->space -= remain_actual; } - GEM_BUG_ON(ring->tail > ring->size - bytes); - cs = ring->vaddr + ring->tail; - ring->tail += bytes; + GEM_BUG_ON(ring->emit > ring->size - bytes); + cs = ring->vaddr + ring->emit; + ring->emit += bytes; ring->space -= bytes; GEM_BUG_ON(ring->space < 0); @@ -1699,7 +1712,7 @@ u32 *intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords) int intel_ring_cacheline_align(struct drm_i915_gem_request *req) { int num_dwords = - (req->ring->tail & (CACHELINE_BYTES - 1)) / sizeof(uint32_t); + (req->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(uint32_t); u32 *cs; if (num_dwords == 0) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index a82a0807f64d..f7144fe09613 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -145,6 +145,7 @@ struct intel_ring { u32 head; u32 tail; + u32 emit; int space; int size; @@ -488,6 +489,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) struct intel_ring * intel_engine_create_ring(struct intel_engine_cs *engine, int size); int intel_ring_pin(struct intel_ring *ring, unsigned int offset_bias); +void intel_ring_reset(struct intel_ring *ring, u32 tail); +void intel_ring_update_space(struct intel_ring *ring); void intel_ring_unpin(struct intel_ring *ring); void intel_ring_free(struct intel_ring *ring); @@ -511,7 +514,7 @@ intel_ring_advance(struct drm_i915_gem_request *req, u32 *cs) * reserved for the command packet (i.e. the value passed to * intel_ring_begin()). */ - GEM_BUG_ON((req->ring->vaddr + req->ring->tail) != cs); + GEM_BUG_ON((req->ring->vaddr + req->ring->emit) != cs); } static inline u32 @@ -540,7 +543,19 @@ assert_ring_tail_valid(const struct intel_ring *ring, unsigned int tail) GEM_BUG_ON(tail >= ring->size); } -void intel_ring_update_space(struct intel_ring *ring); +static inline unsigned int +intel_ring_set_tail(struct intel_ring *ring, unsigned int tail) +{ + /* Whilst writes to the tail are strictly order, there is no + * serialisation between readers and the writers. The tail may be + * read by i915_gem_request_retire() just as it is being updated + * by execlists, as although the breadcrumb is complete, the context + * switch hasn't been seen. 
+ */ + assert_ring_tail_valid(ring, tail); + ring->tail = tail; + return tail; +} void intel_engine_init_global_seqno(struct intel_engine_cs *engine, u32 seqno); From b8d5a9ccfba5fc084b50b00b9f5b587a8e64b72c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 9 Jun 2017 12:03:46 +0100 Subject: [PATCH 043/149] drm/i915: Encourage our shrinker more when our shmemfs allocations fails Commit 24f8e00a8a2e ("drm/i915: Prefer to report ENOMEM rather than incur the oom for gfx allocations") made the bold decision to try and avoid the oomkiller by reporting -ENOMEM to userspace if our allocation failed after attempting to free enough buffer objects. In short, it appears we were giving up too easily (even before we start wondering if one pass of reclaim is as strong as we would like). Part of the problem is that if we only shrink just enough pages for our expected allocation, the likelihood of those pages becoming available to us is less than 100% To counter-act that we ask for twice the number of pages to be made available. Furthermore, we allow the shrinker to pull pages from the active list in later passes. v2: Be a little more cautious in paging out gfx buffers, and leave that to a more balanced approach from shrink_slab(). Important when combined with "drm/i915: Start writeback from the shrinker" as anything shrunk is immediately swapped out and so should be more conservative. Fixes: 24f8e00a8a2e ("drm/i915: Prefer to report ENOMEM rather than incur the oom for gfx allocations") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Cc: Daniel Vetter Reviewed-by: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20170609110350.1767-1-chris@chris-wilson.co.uk (cherry picked from commit 4846bf0ca8cb4304dde6140eff33a92b3fe8ef24) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 50 +++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 462031cbd77f..c93f27b981f5 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2285,8 +2285,8 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) struct page *page; unsigned long last_pfn = 0; /* suppress gcc warning */ unsigned int max_segment; + gfp_t noreclaim; int ret; - gfp_t gfp; /* Assert that the object is not currently in any GPU domain. 
As it * wasn't in the GTT, there shouldn't be any way it could have been in @@ -2315,22 +2315,31 @@ rebuild_st: * Fail silently without starting the shrinker */ mapping = obj->base.filp->f_mapping; - gfp = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM)); - gfp |= __GFP_NORETRY | __GFP_NOWARN; + noreclaim = mapping_gfp_constraint(mapping, + ~(__GFP_IO | __GFP_RECLAIM)); + noreclaim |= __GFP_NORETRY | __GFP_NOWARN; + sg = st->sgl; st->nents = 0; for (i = 0; i < page_count; i++) { - page = shmem_read_mapping_page_gfp(mapping, i, gfp); - if (unlikely(IS_ERR(page))) { - i915_gem_shrink(dev_priv, - page_count, - I915_SHRINK_BOUND | - I915_SHRINK_UNBOUND | - I915_SHRINK_PURGEABLE); + const unsigned int shrink[] = { + I915_SHRINK_BOUND | I915_SHRINK_UNBOUND | I915_SHRINK_PURGEABLE, + 0, + }, *s = shrink; + gfp_t gfp = noreclaim; + + do { page = shmem_read_mapping_page_gfp(mapping, i, gfp); - } - if (unlikely(IS_ERR(page))) { - gfp_t reclaim; + if (likely(!IS_ERR(page))) + break; + + if (!*s) { + ret = PTR_ERR(page); + goto err_sg; + } + + i915_gem_shrink(dev_priv, 2 * page_count, *s++); + cond_resched(); /* We've tried hard to allocate the memory by reaping * our own buffer, now let the real VM do its job and @@ -2340,15 +2349,13 @@ rebuild_st: * defer the oom here by reporting the ENOMEM back * to userspace. */ - reclaim = mapping_gfp_mask(mapping); - reclaim |= __GFP_NORETRY; /* reclaim, but no oom */ - - page = shmem_read_mapping_page_gfp(mapping, i, reclaim); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto err_sg; + if (!*s) { + /* reclaim and warn, but no oom */ + gfp = mapping_gfp_mask(mapping); + gfp |= __GFP_NORETRY; } - } + } while (1); + if (!i || sg->length >= max_segment || page_to_pfn(page) != last_pfn + 1) { @@ -4222,6 +4229,7 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size) mapping = obj->base.filp->f_mapping; mapping_set_gfp_mask(mapping, mask); + GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); i915_gem_object_init(obj, &i915_gem_object_ops); From ce2c58724f7d07e76dadfeba53d6877a9e67341d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 9 Jun 2017 12:03:47 +0100 Subject: [PATCH 044/149] drm/i915: Remove __GFP_NORETRY from our buffer allocator I tried __GFP_NORETRY in the belief that __GFP_RECLAIM was effective. It struggles with handling reclaim of our dirty buffers and relies on reclaim via kswapd. As a result, a single pass of direct reclaim is unreliable when i915 occupies the majority of available memory, and the only means of effectively waiting on kswapd to make progress is by not setting the __GFP_NORETRY flag and looping. That leaves us with the dilemma of invoking the oomkiller instead of propagating the allocation failure back to userspace where it can be handled more gracefully (one hopes). In the future we may have __GFP_MAYFAIL to allow repeats up until we genuinely run out of memory and the oomkiller would have been invoked. Until then, let the oomkiller wreak havoc. v2: Stop playing with side-effects of gfp flags and await __GFP_MAYFAIL v3: Update comments that direct reclaim only appears to be ignoring our dirty buffers!
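Taken together with the previous patch, the resulting allocation policy is roughly the following. This is a condensed sketch, not the literal driver code; the variable names mirror the hunks above.

/* 1) Optimistic attempt: no retries, no warnings, no blocking reclaim. */
gfp_t noreclaim = mapping_gfp_constraint(mapping, ~(__GFP_IO | __GFP_RECLAIM)) |
		  __GFP_NORETRY | __GFP_NOWARN;
struct page *page;

page = shmem_read_mapping_page_gfp(mapping, i, noreclaim);
if (IS_ERR(page)) {
	/* 2) Reap our own buffers, asking for twice what we need. */
	i915_gem_shrink(dev_priv, 2 * page_count,
			I915_SHRINK_BOUND | I915_SHRINK_UNBOUND);
	/* 3) Last resort: full mapping mask without __GFP_NORETRY, so the
	 * allocator may loop waiting on kswapd (and, worst case, invoke the
	 * OOM killer) rather than fail a reclaimable allocation.
	 */
	page = shmem_read_mapping_page_gfp(mapping, i, mapping_gfp_mask(mapping));
}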
Fixes: 24f8e00a8a2e ("drm/i915: Prefer to report ENOMEM rather than incur the oom for gfx allocations") Testcase: igt/gem_tiled_swapping Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Cc: Daniel Vetter Cc: Michal Hocko Link: http://patchwork.freedesktop.org/patch/msgid/20170609110350.1767-2-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen (cherry picked from commit eaf41801559a687cc7511c04dc712984765c9dd7) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c93f27b981f5..615f0a855222 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2352,7 +2352,20 @@ rebuild_st: if (!*s) { /* reclaim and warn, but no oom */ gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY; + + /* Our bo are always dirty and so we require + * kswapd to reclaim our pages (direct reclaim + * does not effectively begin pageout of our + * buffers on its own). However, direct reclaim + * only waits for kswapd when under allocation + * congestion. So as a result __GFP_RECLAIM is + * unreliable and fails to actually reclaim our + * dirty pages -- unless you try over and over + * again with !__GFP_NORETRY. However, we still + * want to fail this allocation rather than + * trigger the out-of-memory killer and for + * this we want the future __GFP_MAYFAIL. + */ } } while (1); From 17b206c27366f3cee816eaf86fafc6a11f628ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 1 Jun 2017 17:36:13 +0300 Subject: [PATCH 045/149] drm/i915: Fix deadlock with the pipe A quirk during resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass down the correct acquire context to the pipe A quirk load detect hack during display resume. Avoids deadlocking the entire thing.
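For background, the resume path uses the standard acquire-context locking pattern sketched below (illustrative; the crtc pointer and the work done under the locks are placeholders). Every lock taken on this path must go through the same drm_modeset_acquire_ctx so that -EDEADLK backoff works; the quirk path reaching for dev->mode_config.acquire_ctx instead is what deadlocks here.

/* Sketch of the usual drm_modeset_acquire_ctx retry loop. */
struct drm_modeset_acquire_ctx ctx;
int ret;

drm_modeset_acquire_init(&ctx, 0);
retry:
	ret = drm_modeset_lock(&crtc->mutex, &ctx);
	if (ret == -EDEADLK) {
		drm_modeset_backoff(&ctx);
		goto retry;
	}
	/* ... perform the resume/modeset work, passing &ctx down ... */
	drm_modeset_drop_locks(&ctx);
	drm_modeset_acquire_fini(&ctx);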
Cc: stable@vger.kernel.org Cc: Maarten Lankhorst Fixes: e2c8b8701e2d ("drm/i915: Use atomic helpers for suspend, v2.") Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/20170601143619.27840-2-ville.syrjala@linux.intel.com Reviewed-by: Maarten Lankhorst (cherry picked from commit aecd36b8a16b2302b33f49ba3fa24c955f1e32f7) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 96b0b01677e2..c27bc95d763c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -120,7 +120,8 @@ static void intel_crtc_init_scalers(struct intel_crtc *crtc, static void skylake_pfit_enable(struct intel_crtc *crtc); static void ironlake_pfit_disable(struct intel_crtc *crtc, bool force); static void ironlake_pfit_enable(struct intel_crtc *crtc); -static void intel_modeset_setup_hw_state(struct drm_device *dev); +static void intel_modeset_setup_hw_state(struct drm_device *dev, + struct drm_modeset_acquire_ctx *ctx); static void intel_pre_disable_primary_noatomic(struct drm_crtc *crtc); struct intel_limit { @@ -3449,7 +3450,7 @@ __intel_display_resume(struct drm_device *dev, struct drm_crtc *crtc; int i, ret; - intel_modeset_setup_hw_state(dev); + intel_modeset_setup_hw_state(dev, ctx); i915_redisable_vga(to_i915(dev)); if (!state) @@ -15030,7 +15031,7 @@ int intel_modeset_init(struct drm_device *dev) intel_setup_outputs(dev_priv); drm_modeset_lock_all(dev); - intel_modeset_setup_hw_state(dev); + intel_modeset_setup_hw_state(dev, dev->mode_config.acquire_ctx); drm_modeset_unlock_all(dev); for_each_intel_crtc(dev, crtc) { @@ -15067,13 +15068,13 @@ int intel_modeset_init(struct drm_device *dev) return 0; } -static void intel_enable_pipe_a(struct drm_device *dev) +static void intel_enable_pipe_a(struct drm_device *dev, + struct drm_modeset_acquire_ctx *ctx) { struct intel_connector *connector; struct drm_connector_list_iter conn_iter; struct drm_connector *crt = NULL; struct intel_load_detect_pipe load_detect_temp; - struct drm_modeset_acquire_ctx *ctx = dev->mode_config.acquire_ctx; int ret; /* We can't just switch on the pipe A, we need to set things up with a @@ -15145,7 +15146,8 @@ static bool has_pch_trancoder(struct drm_i915_private *dev_priv, (HAS_PCH_LPT_H(dev_priv) && pch_transcoder == TRANSCODER_A); } -static void intel_sanitize_crtc(struct intel_crtc *crtc) +static void intel_sanitize_crtc(struct intel_crtc *crtc, + struct drm_modeset_acquire_ctx *ctx) { struct drm_device *dev = crtc->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); @@ -15201,7 +15203,7 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc) * resume. Force-enable the pipe to fix this, the update_dpms * call below we restore the pipe to the right state, but leave * the required bits on. 
*/ - intel_enable_pipe_a(dev); + intel_enable_pipe_a(dev, ctx); } /* Adjust the state of the output pipe according to whether we @@ -15505,7 +15507,8 @@ get_encoder_power_domains(struct drm_i915_private *dev_priv) * and sanitizes it to the current state */ static void -intel_modeset_setup_hw_state(struct drm_device *dev) +intel_modeset_setup_hw_state(struct drm_device *dev, + struct drm_modeset_acquire_ctx *ctx) { struct drm_i915_private *dev_priv = to_i915(dev); enum pipe pipe; @@ -15525,7 +15528,7 @@ intel_modeset_setup_hw_state(struct drm_device *dev) for_each_pipe(dev_priv, pipe) { crtc = intel_get_crtc_for_pipe(dev_priv, pipe); - intel_sanitize_crtc(crtc); + intel_sanitize_crtc(crtc, ctx); intel_dump_pipe_config(crtc, crtc->config, "[setup_hw_state]"); } From b7f5dd36e0c5cb9ca1070a5e0f22f666bcff07ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 1 Jun 2017 17:36:14 +0300 Subject: [PATCH 046/149] drm/i915: Plumb the correct acquire ctx into intel_crtc_disable_noatomic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If intel_crtc_disable_noatomic() were to ever get called during resume we'd end up deadlocking since resume has its own acqcuire_ctx but intel_crtc_disable_noatomic() still tries to use the mode_config.acquire_ctx. Pass down the correct acquire ctx from the top. Cc: stable@vger.kernel.org Cc: Maarten Lankhorst Fixes: e2c8b8701e2d ("drm/i915: Use atomic helpers for suspend, v2.") Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/20170601143619.27840-3-ville.syrjala@linux.intel.com Reviewed-by: Maarten Lankhorst (cherry picked from commit da1d0e265535634bba80d44510b864c620549bee) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index c27bc95d763c..9106ea32b048 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -5826,7 +5826,8 @@ static void i9xx_crtc_disable(struct intel_crtc_state *old_crtc_state, intel_update_watermarks(intel_crtc); } -static void intel_crtc_disable_noatomic(struct drm_crtc *crtc) +static void intel_crtc_disable_noatomic(struct drm_crtc *crtc, + struct drm_modeset_acquire_ctx *ctx) { struct intel_encoder *encoder; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); @@ -5856,7 +5857,7 @@ static void intel_crtc_disable_noatomic(struct drm_crtc *crtc) return; } - state->acquire_ctx = crtc->dev->mode_config.acquire_ctx; + state->acquire_ctx = ctx; /* Everything's already locked, -EDEADLK can't happen. */ crtc_state = intel_atomic_get_crtc_state(state, intel_crtc); @@ -15193,7 +15194,7 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc, plane = crtc->plane; crtc->base.primary->state->visible = true; crtc->plane = !plane; - intel_crtc_disable_noatomic(&crtc->base); + intel_crtc_disable_noatomic(&crtc->base, ctx); crtc->plane = plane; } @@ -15209,7 +15210,7 @@ static void intel_sanitize_crtc(struct intel_crtc *crtc, /* Adjust the state of the output pipe according to whether we * have active connectors/encoders. 
*/ if (crtc->active && !intel_crtc_has_encoders(crtc)) - intel_crtc_disable_noatomic(&crtc->base); + intel_crtc_disable_noatomic(&crtc->base, ctx); if (crtc->active || HAS_GMCH_DISPLAY(dev_priv)) { /* From 4a9bfafc64f44ef83de4e00ca1b57352af6cd8c2 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 11 Jun 2017 16:08:21 +0900 Subject: [PATCH 047/149] ALSA: firewire-lib: Fix stall of process context at packet error At Linux v3.5, packet processing can be done in process context of ALSA PCM application as well as software IRQ context for OHCI 1394. Below is an example of the callgraph (some calls are omitted). ioctl(2) with e.g. HWSYNC (sound/core/pcm_native.c) ->snd_pcm_common_ioctl1() ->snd_pcm_hwsync() ->snd_pcm_stream_lock_irq (sound/core/pcm_lib.c) ->snd_pcm_update_hw_ptr() ->snd_pcm_udpate_hw_ptr0() ->struct snd_pcm_ops.pointer() (sound/firewire/*) = Each handler on drivers in ALSA firewire stack (sound/firewire/amdtp-stream.c) ->amdtp_stream_pcm_pointer() (drivers/firewire/core-iso.c) ->fw_iso_context_flush_completions() ->struct fw_card_driver.flush_iso_completion() (drivers/firewire/ohci.c) = flush_iso_completions() ->struct fw_iso_context.callback.sc (sound/firewire/amdtp-stream.c) = in_stream_callback() or out_stream_callback() ->... ->snd_pcm_stream_unlock_irq When packet queueing error occurs or detecting invalid packets in 'in_stream_callback()' or 'out_stream_callback()', 'snd_pcm_stop_xrun()' is called on local CPU with disabled IRQ. (sound/firewire/amdtp-stream.c) in_stream_callback() or out_stream_callback() ->amdtp_stream_pcm_abort() ->snd_pcm_stop_xrun() ->snd_pcm_stream_lock_irqsave() ->snd_pcm_stop() ->snd_pcm_stream_unlock_irqrestore() The process is stalled on the CPU due to attempt to acquire recursive lock. [ 562.630853] INFO: rcu_sched detected stalls on CPUs/tasks: [ 562.630861] 2-...: (1 GPs behind) idle=37d/140000000000000/0 softirq=38323/38323 fqs=7140 [ 562.630862] (detected by 3, t=15002 jiffies, g=21036, c=21035, q=5933) [ 562.630866] Task dump for CPU 2: [ 562.630867] alsa-source-OXF R running task 0 6619 1 0x00000008 [ 562.630870] Call Trace: [ 562.630876] ? vt_console_print+0x79/0x3e0 [ 562.630880] ? msg_print_text+0x9d/0x100 [ 562.630883] ? up+0x32/0x50 [ 562.630885] ? irq_work_queue+0x8d/0xa0 [ 562.630886] ? console_unlock+0x2b6/0x4b0 [ 562.630888] ? vprintk_emit+0x312/0x4a0 [ 562.630892] ? dev_vprintk_emit+0xbf/0x230 [ 562.630895] ? do_sys_poll+0x37a/0x550 [ 562.630897] ? dev_printk_emit+0x4e/0x70 [ 562.630900] ? __dev_printk+0x3c/0x80 [ 562.630903] ? _raw_spin_lock+0x20/0x30 [ 562.630909] ? snd_pcm_stream_lock+0x31/0x50 [snd_pcm] [ 562.630914] ? _snd_pcm_stream_lock_irqsave+0x2e/0x40 [snd_pcm] [ 562.630918] ? snd_pcm_stop_xrun+0x16/0x70 [snd_pcm] [ 562.630922] ? in_stream_callback+0x3e6/0x450 [snd_firewire_lib] [ 562.630925] ? handle_ir_packet_per_buffer+0x8e/0x1a0 [firewire_ohci] [ 562.630928] ? ohci_flush_iso_completions+0xa3/0x130 [firewire_ohci] [ 562.630932] ? fw_iso_context_flush_completions+0x15/0x20 [firewire_core] [ 562.630935] ? amdtp_stream_pcm_pointer+0x2d/0x40 [snd_firewire_lib] [ 562.630938] ? pcm_capture_pointer+0x19/0x20 [snd_oxfw] [ 562.630943] ? snd_pcm_update_hw_ptr0+0x47/0x3d0 [snd_pcm] [ 562.630945] ? poll_select_copy_remaining+0x150/0x150 [ 562.630947] ? poll_select_copy_remaining+0x150/0x150 [ 562.630952] ? snd_pcm_update_hw_ptr+0x10/0x20 [snd_pcm] [ 562.630956] ? snd_pcm_hwsync+0x45/0xb0 [snd_pcm] [ 562.630960] ? snd_pcm_common_ioctl1+0x1ff/0xc90 [snd_pcm] [ 562.630962] ? futex_wake+0x90/0x170 [ 562.630966] ? 
snd_pcm_capture_ioctl1+0x136/0x260 [snd_pcm] [ 562.630970] ? snd_pcm_capture_ioctl+0x27/0x40 [snd_pcm] [ 562.630972] ? do_vfs_ioctl+0xa3/0x610 [ 562.630974] ? vfs_read+0x11b/0x130 [ 562.630976] ? SyS_ioctl+0x79/0x90 [ 562.630978] ? entry_SYSCALL_64_fastpath+0x1e/0xad This commit fixes the above bug. This assumes two cases: 1. Any error is detected in software IRQ context of OHCI 1394 context. In this case, PCM substream should be aborted in packet handler. On the other hand, it should not be done in any process context. TO distinguish these two context, use 'in_interrupt()' macro. 2. Any error is detect in process context of ALSA PCM application. In this case, PCM substream should not be aborted in packet handler because PCM substream lock is acquired. The task to abort PCM substream should be done in ALSA PCM core. For this purpose, SNDRV_PCM_POS_XRUN is returned at 'struct snd_pcm_ops.pointer()'. Suggested-by: Clemens Ladisch Fixes: e9148dddc3c7("ALSA: firewire-lib: flush completed packets when reading PCM position") Cc: # 4.9+ Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai --- sound/firewire/amdtp-stream.c | 8 ++++++-- sound/firewire/amdtp-stream.h | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c index 9e6f54f8c45d..1e26854b3425 100644 --- a/sound/firewire/amdtp-stream.c +++ b/sound/firewire/amdtp-stream.c @@ -682,7 +682,9 @@ static void out_stream_callback(struct fw_iso_context *context, u32 tstamp, cycle = increment_cycle_count(cycle, 1); if (s->handle_packet(s, 0, cycle, i) < 0) { s->packet_index = -1; - amdtp_stream_pcm_abort(s); + if (in_interrupt()) + amdtp_stream_pcm_abort(s); + WRITE_ONCE(s->pcm_buffer_pointer, SNDRV_PCM_POS_XRUN); return; } } @@ -734,7 +736,9 @@ static void in_stream_callback(struct fw_iso_context *context, u32 tstamp, /* Queueing error or detecting invalid payload. */ if (i < packets) { s->packet_index = -1; - amdtp_stream_pcm_abort(s); + if (in_interrupt()) + amdtp_stream_pcm_abort(s); + WRITE_ONCE(s->pcm_buffer_pointer, SNDRV_PCM_POS_XRUN); return; } diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h index 7e8831722821..ea1a91e99875 100644 --- a/sound/firewire/amdtp-stream.h +++ b/sound/firewire/amdtp-stream.h @@ -135,7 +135,7 @@ struct amdtp_stream { /* For a PCM substream processing. */ struct snd_pcm_substream *pcm; struct tasklet_struct period_tasklet; - unsigned int pcm_buffer_pointer; + snd_pcm_uframes_t pcm_buffer_pointer; unsigned int pcm_period_pointer; /* To wait for first packet. */ From a8ae0a773d38b4b1d4566b0edcb6bb63f4a9d22f Mon Sep 17 00:00:00 2001 From: Dhinakaran Pandiyan Date: Mon, 19 Jun 2017 11:08:28 -0700 Subject: [PATCH 048/149] drm/i915: Don't enable backlight at setup time. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maarten and Ville noticed that we are enabling backlight via DP aux very early in the modeset_init path via the intel_dp_aux_setup_backlight() function, since commit e7156c833903 ("drm/i915: Add Backlight Control using DPCD for eDP connectors (v9)"). Looks like all we need to do during _setup_backlight() is read the current brightness state instead of modifying it. v2: Rewrote commit message. 
Cc: Ville Syrjala Cc: Maarten Lankhorst Cc: Jani Nikula Cc: Yetunde Adebisi Signed-off-by: Dhinakaran Pandiyan Reviewed-by: Maarten Lankhorst Acked-by: Jani Nikula Tested-by: Puthikorn Voravootivat Fixes: e7156c833903 ("drm/i915: Add Backlight Control using DPCD for eDP connectors (v9)") Link: http://patchwork.freedesktop.org/patch/msgid/1497384239-2965-1-git-send-email-dhinakaran.pandiyan@intel.com Signed-off-by: Ville Syrjälä (cherry picked from commit f6262bda462e81e959b80a96dac799bd9df27f73) Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1497895708-19422-1-git-send-email-dhinakaran.pandiyan@intel.com --- drivers/gpu/drm/i915/intel_dp_aux_backlight.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp_aux_backlight.c b/drivers/gpu/drm/i915/intel_dp_aux_backlight.c index 6532e226db29..40ba3134545e 100644 --- a/drivers/gpu/drm/i915/intel_dp_aux_backlight.c +++ b/drivers/gpu/drm/i915/intel_dp_aux_backlight.c @@ -119,8 +119,6 @@ static int intel_dp_aux_setup_backlight(struct intel_connector *connector, struct intel_dp *intel_dp = enc_to_intel_dp(&connector->encoder->base); struct intel_panel *panel = &connector->panel; - intel_dp_aux_enable_backlight(connector); - if (intel_dp->edp_dpcd[2] & DP_EDP_BACKLIGHT_BRIGHTNESS_BYTE_COUNT) panel->backlight.max = 0xFFFF; else From 6e88491cf2a3b17199c78bd53348b39dc6a88275 Mon Sep 17 00:00:00 2001 From: Junshan Fang Date: Thu, 15 Jun 2017 14:02:20 +0800 Subject: [PATCH 049/149] drm/amdgpu: add Polaris12 DID Signed-off-by: Junshan Fang Reviewed-by: Roger.He Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index f2d705e6a75a..ab6b0d0febab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -449,6 +449,7 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x6986, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6987, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x6995, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, + {0x1002, 0x6997, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, {0x1002, 0x699F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_POLARIS12}, /* Vega 10 */ {0x1002, 0x6860, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VEGA10|AMD_EXP_HW_SUPPORT}, From 4a072c71f49b0a0e495ea13423bdb850da73c58c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 15 Jun 2017 00:45:26 +0200 Subject: [PATCH 050/149] random: silence compiler warnings and fix race Odd versions of gcc for the sh4 architecture will actually warn about flags being used while uninitialized, so we set them to zero. Non crazy gccs will optimize that out again, so it doesn't make a difference. Next, over aggressive gccs could inline the expression that defines use_lock, which could then introduce a race resulting in a lock imbalance. By using READ_ONCE, we prevent that fate. Finally, we make that assignment const, so that gcc can still optimize a nice amount. Finally, we fix a potential deadlock between primary_crng.lock and batched_entropy_reset_lock, where they could be called in opposite order. Moving the call to invalidate_batched_entropy to outside the lock rectifies this issue. Fixes: b169c13de473a85b3c859bb36216a4cb5f00a54a Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- drivers/char/random.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index e870f329db88..01a260f67437 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -803,13 +803,13 @@ static int crng_fast_load(const char *cp, size_t len) p[crng_init_cnt % CHACHA20_KEY_SIZE] ^= *cp; cp++; crng_init_cnt++; len--; } + spin_unlock_irqrestore(&primary_crng.lock, flags); if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) { invalidate_batched_entropy(); crng_init = 1; wake_up_interruptible(&crng_init_wait); pr_notice("random: fast init done\n"); } - spin_unlock_irqrestore(&primary_crng.lock, flags); return 1; } @@ -841,6 +841,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) } memzero_explicit(&buf, sizeof(buf)); crng->init_time = jiffies; + spin_unlock_irqrestore(&primary_crng.lock, flags); if (crng == &primary_crng && crng_init < 2) { invalidate_batched_entropy(); crng_init = 2; @@ -848,7 +849,6 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) wake_up_interruptible(&crng_init_wait); pr_notice("random: crng init done\n"); } - spin_unlock_irqrestore(&primary_crng.lock, flags); } static inline void crng_wait_ready(void) @@ -2041,8 +2041,8 @@ static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64); u64 get_random_u64(void) { u64 ret; - bool use_lock = crng_init < 2; - unsigned long flags; + bool use_lock = READ_ONCE(crng_init) < 2; + unsigned long flags = 0; struct batched_entropy *batch; #if BITS_PER_LONG == 64 @@ -2073,8 +2073,8 @@ static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32); u32 get_random_u32(void) { u32 ret; - bool use_lock = crng_init < 2; - unsigned long flags; + bool use_lock = READ_ONCE(crng_init) < 2; + unsigned long flags = 0; struct batched_entropy *batch; if (arch_get_random_int(&ret)) From 6ebf81536d3be327c4f5f59bae3b841d62322343 Mon Sep 17 00:00:00 2001 From: Manish Rangankar Date: Thu, 15 Jun 2017 00:10:39 -0700 Subject: [PATCH 051/149] scsi: qedi: Remove WARN_ON for untracked cleanup. Signed-off-by: Manish Rangankar Reviewed-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/qedi_fw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index 8bc7ee1a8ca8..507512cc478b 100644 --- a/drivers/scsi/qedi/qedi_fw.c +++ b/drivers/scsi/qedi/qedi_fw.c @@ -870,7 +870,6 @@ static void qedi_process_cmd_cleanup_resp(struct qedi_ctx *qedi, QEDI_ERR(&qedi->dbg_ctx, "Delayed or untracked cleanup response, itt=0x%x, tid=0x%x, cid=0x%x, task=%p\n", protoitt, cqe->itid, qedi_conn->iscsi_conn_id, task); - WARN_ON(1); } } From 02d94e04747c5df55410c7b19f3cf72a1a11899b Mon Sep 17 00:00:00 2001 From: Manish Rangankar Date: Thu, 15 Jun 2017 00:10:40 -0700 Subject: [PATCH 052/149] scsi: qedi: Remove WARN_ON from clear task context. Signed-off-by: Manish Rangankar Reviewed-by: Lee Duncan Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qedi/qedi_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 09a294634bc7..879d3b7462f9 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1499,11 +1499,9 @@ err_idx: void qedi_clear_task_idx(struct qedi_ctx *qedi, int idx) { - if (!test_and_clear_bit(idx, qedi->task_idx_map)) { + if (!test_and_clear_bit(idx, qedi->task_idx_map)) QEDI_ERR(&qedi->dbg_ctx, "FW task context, already cleared, tid=0x%x\n", idx); - WARN_ON(1); - } } void qedi_update_itt_map(struct qedi_ctx *qedi, u32 tid, u32 proto_itt, From 817ae460c784f32cd45e60b2b1b21378c3c6a847 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Mon, 19 Jun 2017 19:48:52 -0700 Subject: [PATCH 053/149] Input: i8042 - add Fujitsu Lifebook AH544 to notimeout list Without this quirk, the touchpad is not responsive on this product, with the following message repeated in the logs: psmouse serio1: bad data from KBC - timeout Add it to the notimeout list alongside other similar Fujitsu laptops. Signed-off-by: Daniel Drake Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-x86ia64io.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 09720d950686..f932a83b4990 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -723,6 +723,13 @@ static const struct dmi_system_id __initconst i8042_dmi_notimeout_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK U574"), }, }, + { + /* Fujitsu UH554 laptop */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK UH544"), + }, + }, { } }; From c7ecb9068e6772c43941ce609f08bc53f36e1dce Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 14 Jun 2017 07:37:14 +0200 Subject: [PATCH 054/149] ALSA: hda - Apply quirks to Broxton-T, too Broxton-T was a forgotten child and we didn't apply the quirks for Skylake+ properly. Meanwhile, a quirk for reducing the DMA latency seems specific to the early Broxton model, so we leave as is. 
Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e3c696c46a21..01eb1dc7b5b3 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -370,11 +370,12 @@ enum { #define IS_KBL_LP(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x9d71) #define IS_KBL_H(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa2f0) #define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98) +#define IS_BXT_T(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x1a98) #define IS_GLK(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x3198) #define IS_CFL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa348) -#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci)) || \ - IS_KBL(pci) || IS_KBL_LP(pci) || IS_KBL_H(pci) || \ - IS_GLK(pci) || IS_CFL(pci) +#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci) || \ + IS_BXT_T(pci) || IS_KBL(pci) || IS_KBL_LP(pci) || \ + IS_KBL_H(pci) || IS_GLK(pci) || IS_CFL(pci)) static char *driver_short_names[] = { [AZX_DRIVER_ICH] = "HDA Intel", From ceea5e3771ed2378668455fa21861bead7504df5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Jun 2017 16:44:20 -0700 Subject: [PATCH 055/149] time: Fix clock->read(clock) race around clocksource changes In tests, which excercise switching of clocksources, a NULL pointer dereference can be observed on AMR64 platforms in the clocksource read() function: u64 clocksource_mmio_readl_down(struct clocksource *c) { return ~(u64)readl_relaxed(to_mmio_clksrc(c)->reg) & c->mask; } This is called from the core timekeeping code via: cycle_now = tkr->read(tkr->clock); tkr->read is the cached tkr->clock->read() function pointer. When the clocksource is changed then tkr->clock and tkr->read are updated sequentially. The code above results in a sequential load operation of tkr->read and tkr->clock as well. If the store to tkr->clock hits between the loads of tkr->read and tkr->clock, then the old read() function is called with the new clock pointer. As a consequence the read() function dereferences a different data structure and the resulting 'reg' pointer can point anywhere including NULL. This problem was introduced when the timekeeping code was switched over to use struct tk_read_base. Before that, it was theoretically possible as well when the compiler decided to reload clock in the code sequence: now = tk->clock->read(tk->clock); Add a helper function which avoids the issue by reading tk_read_base->clock once into a local variable clk and then issue the read function via clk->read(clk). This guarantees that the read() function always gets the proper clocksource pointer handed in. Since there is now no use for the tkr.read pointer, this patch also removes it, and to address stopping the fast timekeeper during suspend/resume, it introduces a dummy clocksource to use rather then just a dummy read function. 
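Spelled out, the race window in the existing code is roughly the following
load sequence (an illustrative annotation only, not part of the patch):

    read  = tkr->read;          /* load #1: old clocksource's read()        */
                                /* <-- clocksource switch stores the new    */
                                /*     tkr->clock (and tkr->read) here      */
    clock = tkr->clock;         /* load #2: new clocksource                 */
    now   = read(clock);        /* old read() handed the new clock; it      */
                                /* dereferences the wrong structure and     */
                                /* 'reg' can point anywhere, including NULL */

The tk_clock_read() helper introduced below closes this window by loading
tkr->clock exactly once via READ_ONCE() and invoking the read() method
through that same local pointer.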
Signed-off-by: John Stultz Acked-by: Ingo Molnar Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: stable Cc: Miroslav Lichvar Cc: Daniel Mentz Link: http://lkml.kernel.org/r/1496965462-20003-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/timekeeper_internal.h | 1 - kernel/time/timekeeping.c | 52 ++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 110f4532188c..e9834ada4d0c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -29,7 +29,6 @@ */ struct tk_read_base { struct clocksource *clock; - u64 (*read)(struct clocksource *cs); u64 mask; u64 cycle_last; u32 mult; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9652bc57fd09..eff94cb8e89e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). 
+ */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -175,7 +195,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) */ do { seq = read_seqcount_begin(&tk_core.seq); - now = tkr->read(tkr->clock); + now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; @@ -209,7 +229,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) u64 cycle_now, delta; /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -238,12 +258,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; - tk->tkr_mono.read = clock->read; tk->tkr_mono.mask = clock->mask; - tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; - tk->tkr_raw.read = clock->read; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) now += timekeeping_delta_to_ns(tkr, clocksource_delta( - tkr->read(tkr->clock), + tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +479,10 @@ static u64 dummy_clock_read(struct clocksource *cs) return cycles_at_suspend; } +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. @@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk) struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - cycles_at_suspend = tkr->read(tkr->clock); - tkr_dummy.read = dummy_clock_read; + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - tkr_dummy.read = dummy_clock_read; + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } @@ -649,11 +671,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr_mono.clock; u64 cycle_now, delta; u64 nsec; - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -929,8 +950,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) do { seq = read_seqcount_begin(&tk_core.seq); - - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, @@ -1108,7 +1128,7 @@ int get_device_system_crosststamp(int (*get_time_fn) * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. 
*/ - now = tk->tkr_mono.read(tk->tkr_mono.clock); + now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!cycle_between(interval_start, cycles, now)) { clock_was_set_seq = tk->clock_was_set_seq; @@ -1629,7 +1649,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { u64 nsec, cyc_delta; @@ -2030,7 +2050,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif From 3d88d56c5873f6eebe23e05c3da701960146b801 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Jun 2017 16:44:21 -0700 Subject: [PATCH 056/149] time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting Due to how the MONOTONIC_RAW accumulation logic was handled, there is the potential for a 1ns discontinuity when we do accumulations. This small discontinuity has for the most part gone un-noticed, but since ARM64 enabled CLOCK_MONOTONIC_RAW in their vDSO clock_gettime implementation, we've seen failures with the inconsistency-check test in kselftest. This patch addresses the issue by using the same sub-ns accumulation handling that CLOCK_MONOTONIC uses, which avoids the issue for in-kernel users. Since the ARM64 vDSO implementation has its own clock_gettime calculation logic, this patch reduces the frequency of errors, but failures are still seen. The ARM64 vDSO will need to be updated to include the sub-nanosecond xtime_nsec values in its calculation for this issue to be completely fixed. Signed-off-by: John Stultz Tested-by: Daniel Mentz Cc: Prarit Bhargava Cc: Kevin Brodsky Cc: Richard Cochran Cc: Stephen Boyd Cc: Will Deacon Cc: "stable #4 . 8+" Cc: Miroslav Lichvar Link: http://lkml.kernel.org/r/1496965462-20003-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/timekeeper_internal.h | 4 ++-- kernel/time/timekeeping.c | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e9834ada4d0c..f7043ccca81c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -57,7 +57,7 @@ struct tk_read_base { * interval. * @xtime_remainder: Shifted nano seconds left over when rounding * @cycle_interval - * @raw_interval: Raw nano seconds accumulated per NTP interval. + * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. * @ntp_error: Difference between accumulated time and NTP time in ntp * shifted nano seconds. * @ntp_error_shift: Shift conversion between clock shifted nano seconds and @@ -99,7 +99,7 @@ struct timekeeper { u64 cycle_interval; u64 xtime_interval; s64 xtime_remainder; - u32 raw_interval; + u64 raw_interval; /* The ntp_tick_length() value currently being used. 
* This cached copy ensures we consistently apply the tick * length for an entire tick, as ntp_tick_length may change diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index eff94cb8e89e..b602c48cb841 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -280,7 +280,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = (interval * clock->mult) >> clock->shift; + tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -1996,7 +1996,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; - u64 raw_nsecs; + u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) @@ -2011,14 +2011,15 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = (u64)tk->raw_interval << shift; - raw_nsecs += tk->raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - tk->raw_time.tv_sec += raw_secs; + tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_time.tv_sec++; } - tk->raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; From dbb236c1ceb697a559e0694ac4c9e7b9131d0b16 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 8 Jun 2017 16:44:22 -0700 Subject: [PATCH 057/149] arm64/vdso: Fix nsec handling for CLOCK_MONOTONIC_RAW Recently vDSO support for CLOCK_MONOTONIC_RAW was added in 49eea433b326 ("arm64: Add support for CLOCK_MONOTONIC_RAW in clock_gettime() vDSO"). Noticing that the core timekeeping code never set tkr_raw.xtime_nsec, the vDSO implementation didn't bother exposing it via the data page and instead took the unshifted tk->raw_time.tv_nsec value which was then immediately shifted left in the vDSO code. Unfortunately, by accellerating the MONOTONIC_RAW clockid, it uncovered potential 1ns time inconsistencies caused by the timekeeping core not handing sub-ns resolution. Now that the core code has been fixed and is actually setting tkr_raw.xtime_nsec, we need to take that into account in the vDSO by adding it to the shifted raw_time value, in order to fix the user-visible inconsistency. Rather than do that at each use (and expand the data page in the process), instead perform the shift/addition operation when populating the data page and remove the shift from the vDSO code entirely. [jstultz: minor whitespace tweak, tried to improve commit message to make it more clear this fixes a regression] Reported-by: John Stultz Signed-off-by: Will Deacon Signed-off-by: John Stultz Tested-by: Daniel Mentz Acked-by: Kevin Brodsky Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: "stable #4 . 
8+" Cc: Miroslav Lichvar Link: http://lkml.kernel.org/r/1496965462-20003-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- arch/arm64/kernel/vdso.c | 5 +++-- arch/arm64/kernel/vdso/gettimeofday.S | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 41b6e31f8f55..d0cb007fa482 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -221,10 +221,11 @@ void update_vsyscall(struct timekeeper *tk) /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; vdso_data->raw_time_sec = tk->raw_time.tv_sec; - vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; + vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec << + tk->tkr_raw.shift) + + tk->tkr_raw.xtime_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - /* tkr_raw.xtime_nsec == 0 */ vdso_data->cs_mono_mult = tk->tkr_mono.mult; vdso_data->cs_raw_mult = tk->tkr_raw.mult; /* tkr_mono.shift == tkr_raw.shift */ diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index e00b4671bd7c..76320e920965 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -256,7 +256,6 @@ monotonic_raw: seqcnt_check fail=monotonic_raw /* All computations are done with left-shifted nsecs. */ - lsl x14, x14, x12 get_nsec_per_sec res=x9 lsl x9, x9, x12 From 6c7515c61ffa0985c57abd8892c7928b52b9a306 Mon Sep 17 00:00:00 2001 From: Ralph Sennhauser Date: Thu, 1 Jun 2017 22:08:20 +0200 Subject: [PATCH 058/149] gpio: mvebu: change compatible string for PWM support As it turns out more than just Armada 370 and XP support using GPIO lines as PWM lines. For example the Armada 38x family has the same hardware support. As such "marvell,armada-370-xp-gpio" for the compatible string is a misnomer. Change the compatible string to "marvell,armada-370-gpio" before the driver makes it out of the -rc stage. This also follows the practice of using only the first device family supported as part of the name. Also update the documentation and comments in the code accordingly. Fixes: 757642f9a584 ("gpio: mvebu: Add limited PWM support") Signed-off-by: Ralph Sennhauser Acked-by: Gregory CLEMENT Acked-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/gpio/gpio-mvebu.txt | 6 +++--- drivers/gpio/gpio-mvebu.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt index 42c3bb2d53e8..01e331a5f3e7 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt @@ -41,9 +41,9 @@ Required properties: Optional properties: In order to use the GPIO lines in PWM mode, some additional optional -properties are required. Only Armada 370 and XP support these properties. +properties are required. -- compatible: Must contain "marvell,armada-370-xp-gpio" +- compatible: Must contain "marvell,armada-370-gpio" - reg: an additional register set is needed, for the GPIO Blink Counter on/off registers. 
@@ -71,7 +71,7 @@ Example: }; gpio1: gpio@18140 { - compatible = "marvell,armada-370-xp-gpio"; + compatible = "marvell,armada-370-gpio"; reg = <0x18140 0x40>, <0x181c8 0x08>; reg-names = "gpio", "pwm"; ngpios = <17>; diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c index 5104b6398139..c83ea68be792 100644 --- a/drivers/gpio/gpio-mvebu.c +++ b/drivers/gpio/gpio-mvebu.c @@ -721,7 +721,7 @@ static int mvebu_pwm_probe(struct platform_device *pdev, u32 set; if (!of_device_is_compatible(mvchip->chip.of_node, - "marvell,armada-370-xp-gpio")) + "marvell,armada-370-gpio")) return 0; if (IS_ERR(mvchip->clk)) @@ -852,7 +852,7 @@ static const struct of_device_id mvebu_gpio_of_match[] = { .data = (void *) MVEBU_GPIO_SOC_VARIANT_ARMADAXP, }, { - .compatible = "marvell,armada-370-xp-gpio", + .compatible = "marvell,armada-370-gpio", .data = (void *) MVEBU_GPIO_SOC_VARIANT_ORION, }, { @@ -1128,7 +1128,7 @@ static int mvebu_gpio_probe(struct platform_device *pdev) mvchip); } - /* Armada 370/XP has simple PWM support for GPIO lines */ + /* Some MVEBU SoCs have simple PWM support for GPIO lines */ if (IS_ENABLED(CONFIG_PWM)) return mvebu_pwm_probe(pdev, mvchip, id); From e27a9eca5d4a392b96ce5d5238c8d637bcb0a52c Mon Sep 17 00:00:00 2001 From: James Cowgill Date: Tue, 20 Jun 2017 10:57:51 +0100 Subject: [PATCH 059/149] KVM: MIPS: Fix maybe-uninitialized build failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes a "maybe-uninitialized" build failure in arch/mips/kvm/tlb.c when KVM, DYNAMIC_DEBUG and JUMP_LABEL are all enabled. The failure is: In file included from ./include/linux/printk.h:329:0, from ./include/linux/kernel.h:13, from ./include/asm-generic/bug.h:15, from ./arch/mips/include/asm/bug.h:41, from ./include/linux/bug.h:4, from ./include/linux/thread_info.h:11, from ./include/asm-generic/current.h:4, from ./arch/mips/include/generated/asm/current.h:1, from ./include/linux/sched.h:11, from arch/mips/kvm/tlb.c:13: arch/mips/kvm/tlb.c: In function ‘kvm_mips_host_tlb_inv’: ./include/linux/dynamic_debug.h:126:3: error: ‘idx_kernel’ may be used uninitialized in this function [-Werror=maybe-uninitialized] __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \ ^~~~~~~~~~~~~~~~~~ arch/mips/kvm/tlb.c:169:16: note: ‘idx_kernel’ was declared here int idx_user, idx_kernel; ^~~~~~~~~~ There is a similar error relating to "idx_user". Both errors were observed with GCC 6. As far as I can tell, it is impossible for either idx_user or idx_kernel to be uninitialized when they are later read in the calls to kvm_debug, but to satisfy the compiler, add zero initializers to both variables. Signed-off-by: James Cowgill Fixes: 57e3869cfaae ("KVM: MIPS/TLB: Generalise host TLB invalidate to kernel ASID") Cc: # 4.11+ Acked-by: James Hogan Signed-off-by: Radim Krčmář --- arch/mips/kvm/tlb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 7c6336dd2638..7cd92166a0b9 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -166,7 +166,11 @@ static int _kvm_mips_host_tlb_inv(unsigned long entryhi) int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, bool user, bool kernel) { - int idx_user, idx_kernel; + /* + * Initialize idx_user and idx_kernel to workaround bogus + * maybe-initialized warning when using GCC 6. 
+ */ + int idx_user = 0, idx_kernel = 0; unsigned long flags, old_entryhi; local_irq_save(flags); From 9e69672e90ccff10dab6f0c9545226a886e5973c Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Wed, 14 Jun 2017 17:13:14 +0200 Subject: [PATCH 060/149] dt-bindings: mfd: Update STM32 timers clock names Clock name has been updated during driver/DT binding review: https://lkml.org/lkml/2016/12/13/718 Update DT binding doc to reflect this. Fixes: 8f9359c6c6a0 (dt-bindings: mfd: Add bindings for STM32 Timers driver) Signed-off-by: Fabrice Gasnier Acked-by: Benjamin Gaignard Signed-off-by: Lee Jones --- Documentation/devicetree/bindings/mfd/stm32-timers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mfd/stm32-timers.txt b/Documentation/devicetree/bindings/mfd/stm32-timers.txt index bbd083f5600a..1db6e0057a63 100644 --- a/Documentation/devicetree/bindings/mfd/stm32-timers.txt +++ b/Documentation/devicetree/bindings/mfd/stm32-timers.txt @@ -31,7 +31,7 @@ Example: compatible = "st,stm32-timers"; reg = <0x40010000 0x400>; clocks = <&rcc 0 160>; - clock-names = "clk_int"; + clock-names = "int"; pwm { compatible = "st,stm32-pwm"; From 05b4017b37f1fce4b7185f138126dd8decdb381f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 15 Jun 2017 10:55:11 -0400 Subject: [PATCH 061/149] drm/amdgpu/atom: fix ps allocation size for EnableDispPowerGating MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were using the wrong structure which lead to an overflow on some boards. bug: https://bugs.freedesktop.org/show_bug.cgi?id=101387 Acked-by: Chunming Zhou Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/atombios_crtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c b/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c index 8c9bc75a9c2d..8a0818b23ea4 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_crtc.c @@ -165,7 +165,7 @@ void amdgpu_atombios_crtc_powergate(struct drm_crtc *crtc, int state) struct drm_device *dev = crtc->dev; struct amdgpu_device *adev = dev->dev_private; int index = GetIndexIntoMasterTable(COMMAND, EnableDispPowerGating); - ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1 args; + ENABLE_DISP_POWER_GATING_PS_ALLOCATION args; memset(&args, 0, sizeof(args)); @@ -178,7 +178,7 @@ void amdgpu_atombios_crtc_powergate(struct drm_crtc *crtc, int state) void amdgpu_atombios_crtc_powergate_init(struct amdgpu_device *adev) { int index = GetIndexIntoMasterTable(COMMAND, EnableDispPowerGating); - ENABLE_DISP_POWER_GATING_PARAMETERS_V2_1 args; + ENABLE_DISP_POWER_GATING_PS_ALLOCATION args; memset(&args, 0, sizeof(args)); From 52b482b0f4fd6d5267faf29fe91398e203f3c230 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 15 Jun 2017 11:12:28 -0400 Subject: [PATCH 062/149] drm/amdgpu: adjust default display clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase the default display clock on newer asics to accomodate some high res modes with really high refresh rates. 
bug: https://bugs.freedesktop.org/show_bug.cgi?id=93826 Acked-by: Chunming Zhou Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index 1cf78f4dd339..1e8e1123ddf4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -693,6 +693,10 @@ int amdgpu_atombios_get_clock_info(struct amdgpu_device *adev) DRM_INFO("Changing default dispclk from %dMhz to 600Mhz\n", adev->clock.default_dispclk / 100); adev->clock.default_dispclk = 60000; + } else if (adev->clock.default_dispclk <= 60000) { + DRM_INFO("Changing default dispclk from %dMhz to 625Mhz\n", + adev->clock.default_dispclk / 100); + adev->clock.default_dispclk = 62500; } adev->clock.dp_extclk = le16_to_cpu(firmware_info->info_21.usUniphyDPModeExtClkFreq); From 4eb59793cca00b0e629b6d55b5abb5acb82c5868 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 19 Jun 2017 12:52:47 -0400 Subject: [PATCH 063/149] drm/radeon: add a PX quirk for another K53TK variant Disable PX on these systems. bug: https://bugs.freedesktop.org/show_bug.cgi?id=101491 Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_device.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 6ecf42783d4b..0a6444d72000 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -136,6 +136,10 @@ static struct radeon_px_quirk radeon_px_quirk_list[] = { * https://bugzilla.kernel.org/show_bug.cgi?id=51381 */ { PCI_VENDOR_ID_ATI, 0x6840, 0x1043, 0x2122, RADEON_PX_QUIRK_DISABLE_PX }, + /* Asus K53TK laptop with AMD A6-3420M APU and Radeon 7670m GPU + * https://bugs.freedesktop.org/show_bug.cgi?id=101491 + */ + { PCI_VENDOR_ID_ATI, 0x6741, 0x1043, 0x2122, RADEON_PX_QUIRK_DISABLE_PX }, /* macbook pro 8.2 */ { PCI_VENDOR_ID_ATI, 0x6741, PCI_VENDOR_ID_APPLE, 0x00e2, RADEON_PX_QUIRK_LONG_WAKEUP }, { 0, 0, 0, 0, 0 }, From acfd6ee4fa7ebeee75511825fe02be3f7ac1d668 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 19 Jun 2017 15:59:58 -0400 Subject: [PATCH 064/149] drm/radeon: add a quirk for Toshiba Satellite L20-183 Fixes resume from suspend. bug: https://bugzilla.kernel.org/show_bug.cgi?id=196121 Reported-by: Przemek Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_combios.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index 432480ff9d22..3178ba0c537c 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -3393,6 +3393,13 @@ void radeon_combios_asic_init(struct drm_device *dev) rdev->pdev->subsystem_vendor == 0x103c && rdev->pdev->subsystem_device == 0x280a) return; + /* quirk for rs4xx Toshiba Sattellite L20-183 latop to make it resume + * - it hangs on resume inside the dynclk 1 table. 
+ */ + if (rdev->family == CHIP_RS400 && + rdev->pdev->subsystem_vendor == 0x1179 && + rdev->pdev->subsystem_device == 0xff31) + return; /* DYN CLK 1 */ table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE); From 9f93d87cba63e3d18629261243b1f633519eabb5 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Jun 2017 09:04:05 +0200 Subject: [PATCH 065/149] irqchip/mips-gic: Mark count and compare accessors notrace gic_read_count(), gic_write_compare() and gic_write_cpu_compare() are often used in a sequence to update the compare register with a count value increased by a small offset. With small delta values used to update the compare register, the time to update function trace for these operations may be longer than the update timeout leading to update failure. Signed-off-by: Marcin Nowakowski Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: linux-mips@linux-mips.org Cc: Jason Cooper Link: http://lkml.kernel.org/r/1496991845-27031-1-git-send-email-marcin.nowakowski@imgtec.com --- drivers/irqchip/irq-mips-gic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index eb7fbe159963..929f8558bf1c 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -140,7 +140,7 @@ static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe) } #ifdef CONFIG_CLKSRC_MIPS_GIC -u64 gic_read_count(void) +u64 notrace gic_read_count(void) { unsigned int hi, hi2, lo; @@ -167,7 +167,7 @@ unsigned int gic_get_count_width(void) return bits; } -void gic_write_compare(u64 cnt) +void notrace gic_write_compare(u64 cnt) { if (mips_cm_is64) { gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE), cnt); @@ -179,7 +179,7 @@ void gic_write_compare(u64 cnt) } } -void gic_write_cpu_compare(u64 cnt, int cpu) +void notrace gic_write_cpu_compare(u64 cnt, int cpu) { unsigned long flags; From 8a7b0d8e8d9962ec3b2ae64dd4e86d68a6fb9220 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 May 2017 08:30:40 +0300 Subject: [PATCH 066/149] CIFS: Set ->should_dirty in cifs_user_readv() The current code causes a static checker warning because ITER_IOVEC is zero so the condition is never true. Fixes: 6685c5e2d1ac ("CIFS: Add asynchronous read support through kernel AIO") Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0fd081bd2a2f..fcef70602b27 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3271,7 +3271,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) if (!is_sync_kiocb(iocb)) ctx->iocb = iocb; - if (to->type & ITER_IOVEC) + if (to->type == ITER_IOVEC) ctx->should_dirty = true; rc = setup_aio_ctx_iter(ctx, to, READ); From ecf3411a121e7a653e309ff50a820ffa87c537f8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 May 2017 19:24:15 +0100 Subject: [PATCH 067/149] CIFS: check if pages is null rather than bv for a failed allocation pages is being allocated however a null check on bv is being used to see if the allocation failed. Fix this by checking if pages is null. 
Detected by CoverityScan, CID#1432974 ("Logically dead code") Fixes: ccf7f4088af2dd ("CIFS: Add asynchronous context to support kernel AIO") Signed-off-by: Colin Ian King Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b08531977daa..3b147dc6af63 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -810,7 +810,7 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) if (!pages) { pages = vmalloc(max_pages * sizeof(struct page *)); - if (!bv) { + if (!pages) { kvfree(bv); return -ENOMEM; } From dcd87838c06f05ab7650b249ebf0d5b57ae63e1e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 6 Jun 2017 16:58:58 -0700 Subject: [PATCH 068/149] CIFS: Improve readdir verbosity Downgrade the loglevel for SMB2 to prevent filling the log with messages if e.g. readdir was interrupted. Also make SMB2 and SMB1 codepaths do the same logging during readdir. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable --- fs/cifs/smb1ops.c | 9 +++++++-- fs/cifs/smb2ops.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 27bc360c7ffd..a723df3e0197 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -849,8 +849,13 @@ cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *fid, __u16 search_flags, struct cifs_search_info *srch_inf) { - return CIFSFindFirst(xid, tcon, path, cifs_sb, - &fid->netfid, search_flags, srch_inf, true); + int rc; + + rc = CIFSFindFirst(xid, tcon, path, cifs_sb, + &fid->netfid, search_flags, srch_inf, true); + if (rc) + cifs_dbg(FYI, "find first failed=%d\n", rc); + return rc; } static int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c58691834eb2..59726013375b 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -982,7 +982,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { - cifs_dbg(VFS, "open dir failed\n"); + cifs_dbg(FYI, "open dir failed rc=%d\n", rc); return rc; } @@ -992,7 +992,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, fid->volatile_fid, 0, srch_inf); if (rc) { - cifs_dbg(VFS, "query directory failed\n"); + cifs_dbg(FYI, "query directory failed rc=%d\n", rc); SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } return rc; From e125f5284f81bbb765a504494622b45c02faf978 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Jun 2017 00:33:45 +0100 Subject: [PATCH 069/149] cifs: remove redundant return in cifs_creation_time_get There is a redundant return in function cifs_creation_time_get that appears to be old vestigial code than can be removed. So remove it. 
Detected by CoverityScan, CID#1361924 ("Structurally dead code") Signed-off-by: Colin Ian King Signed-off-by: Steve French --- fs/cifs/xattr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3cb5c9e2d4e7..de50e749ff05 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -188,8 +188,6 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, pcreatetime = (__u64 *)value; *pcreatetime = CIFS_I(inode)->createtime; return sizeof(__u64); - - return rc; } From 517a6e43c4872c89794af5b377fa085e47345952 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Sun, 11 Jun 2017 09:12:47 +0200 Subject: [PATCH 070/149] CIFS: Fix some return values in case of error in 'crypt_message' 'rc' is known to be 0 at this point. So if 'init_sg' or 'kzalloc' fails, we should return -ENOMEM instead. Also remove a useless 'rc' in a debug message as it is meaningless here. Fixes: 026e93dc0a3ee ("CIFS: Encrypt SMB3 requests before sending") Signed-off-by: Christophe JAILLET Reviewed-by: Pavel Shilovsky Reviewed-by: Aurelien Aptel Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 59726013375b..7e48561abd29 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1809,7 +1809,8 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) sg = init_sg(rqst, sign); if (!sg) { - cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc); + cifs_dbg(VFS, "%s: Failed to init sg", __func__); + rc = -ENOMEM; goto free_req; } @@ -1817,6 +1818,7 @@ crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) iv = kzalloc(iv_len, GFP_KERNEL); if (!iv) { cifs_dbg(VFS, "%s: Failed to alloc IV", __func__); + rc = -ENOMEM; goto free_sg; } iv[0] = 3; From e94ac3510b6a0f696f2c442c4fc4051c8101ef12 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 20 Jun 2017 22:28:37 +0200 Subject: [PATCH 071/149] drm: Fix GETCONNECTOR regression In commit 91eefc05f0ac71902906b2058360e61bd25137fe Author: Daniel Vetter Date: Wed Dec 14 00:08:10 2016 +0100 drm: Tighten locking in drm_mode_getconnector I reordered the logic a bit in that IOCTL, but that broke userspace since it'll get the new mode list, but not the new property values. Fix that again. v2: Fix up the error path handling when copy_to_user for the modes failes (Dhinakaran). Fixes: 91eefc05f0ac ("drm: Tighten locking in drm_mode_getconnector") Cc: Sean Paul Cc: Daniel Vetter Cc: Jani Nikula Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Reported-by: "H.J. Lu" Tested-by: "H.J. Lu" Cc: # v4.11+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=100576 Cc: "H.J. 
Lu" Cc: "Pandiyan, Dhinakaran" Reviewed-by: Sean Paul Reviewed-by: Dhinakaran Pandiyan Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170620202837.1701-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_connector.c | 38 +++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index 9f847615ac74..48ca2457df8c 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -1229,21 +1229,6 @@ int drm_mode_getconnector(struct drm_device *dev, void *data, if (!connector) return -ENOENT; - drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); - encoder = drm_connector_get_encoder(connector); - if (encoder) - out_resp->encoder_id = encoder->base.id; - else - out_resp->encoder_id = 0; - - ret = drm_mode_object_get_properties(&connector->base, file_priv->atomic, - (uint32_t __user *)(unsigned long)(out_resp->props_ptr), - (uint64_t __user *)(unsigned long)(out_resp->prop_values_ptr), - &out_resp->count_props); - drm_modeset_unlock(&dev->mode_config.connection_mutex); - if (ret) - goto out_unref; - for (i = 0; i < DRM_CONNECTOR_MAX_ENCODER; i++) if (connector->encoder_ids[i] != 0) encoders_count++; @@ -1256,7 +1241,7 @@ int drm_mode_getconnector(struct drm_device *dev, void *data, if (put_user(connector->encoder_ids[i], encoder_ptr + copied)) { ret = -EFAULT; - goto out_unref; + goto out; } copied++; } @@ -1300,15 +1285,32 @@ int drm_mode_getconnector(struct drm_device *dev, void *data, if (copy_to_user(mode_ptr + copied, &u_mode, sizeof(u_mode))) { ret = -EFAULT; + mutex_unlock(&dev->mode_config.mutex); + goto out; } copied++; } } out_resp->count_modes = mode_count; -out: mutex_unlock(&dev->mode_config.mutex); -out_unref: + + drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); + encoder = drm_connector_get_encoder(connector); + if (encoder) + out_resp->encoder_id = encoder->base.id; + else + out_resp->encoder_id = 0; + + /* Only grab properties after probing, to make sure EDID and other + * properties reflect the latest status. */ + ret = drm_mode_object_get_properties(&connector->base, file_priv->atomic, + (uint32_t __user *)(unsigned long)(out_resp->props_ptr), + (uint64_t __user *)(unsigned long)(out_resp->prop_values_ptr), + &out_resp->count_props); + drm_modeset_unlock(&dev->mode_config.connection_mutex); + +out: drm_connector_put(connector); return ret; From 8a1898db51a3390241cd5fae267dc8aaa9db0f8b Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 20 Jun 2017 12:26:39 +0200 Subject: [PATCH 072/149] perf/aux: Correct return code of rb_alloc_aux() if !has_aux(ev) If the event for which an AUX area is about to be allocated, does not support setting up an AUX area, rb_alloc_aux() return -ENOTSUPP. This error condition is being returned unfiltered to the user space, and, for example, the perf tools fails with: failed to mmap with 524 (INTERNAL ERROR: strerror_r(524, 0x3fff497a1c8, 512)=22) This error can be easily seen with "perf record -m 128,256 -e cpu-clock". The 524 error code maps to -ENOTSUPP (in rb_alloc_aux()). The -ENOTSUPP error code shall be only used within the kernel. So the correct error code would then be -EOPNOTSUPP. With this commit, the perf tool then reports: failed to mmap with 95 (Operation not supported) which is more clear. 
Signed-off-by: Hendrik Brueckner Acked-by: Alexander Shishkin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pu Hou Cc: Thomas Gleixner Cc: Thomas-Mich Richter Cc: acme@kernel.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/1497954399-6355-1-git-send-email-brueckner@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2831480c63a2..ee97196bb151 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* From 7def52b78a5fda14864aab9b6fd14f09a4d4ff72 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 19 Jun 2017 10:55:47 -0400 Subject: [PATCH 073/149] dm integrity: fix to not disable/enable interrupts from interrupt context Use spin_lock_irqsave and spin_unlock_irqrestore rather than spin_{lock,unlock}_irq in submit_flush_bio(). Otherwise lockdep issues the following warning: DEBUG_LOCKS_WARN_ON(current->hardirq_context) WARNING: CPU: 1 PID: 0 at kernel/locking/lockdep.c:2748 trace_hardirqs_on_caller+0x107/0x180 Reported-by: Ondrej Kozina Tested-by: Ondrej Kozina Signed-off-by: Mike Snitzer Acked-by: Mikulas Patocka --- drivers/md/dm-integrity.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 4ab10cf718c9..93b181088168 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1105,10 +1105,13 @@ static void schedule_autocommit(struct dm_integrity_c *ic) static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) { struct bio *bio; - spin_lock_irq(&ic->endio_wait.lock); + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); bio_list_add(&ic->flush_bio_list, bio); - spin_unlock_irq(&ic->endio_wait.lock); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + queue_work(ic->commit_wq, &ic->commit_work); } From feb7695fe9fb83084aa29de0094774f4c9d4c9fc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 20 Jun 2017 19:14:30 -0400 Subject: [PATCH 074/149] dm io: fix duplicate bio completion due to missing ref count If only a subset of the devices associated with multiple regions support a given special operation (eg. DISCARD) then the dec_count() that is used to set error for the region must increment the io->count. Otherwise, when the dec_count() is called it can cause the dm-io caller's bio to be completed multiple times. As was reported against the dm-mirror target that had mirror legs with a mix of discard capabilities. 
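In other words, the error-reporting dec_count() in the unsupported-command
path was not preceded by a matching increment, so io->count dropped one time
too many and the caller's bio could be ended more than once. A sketch of the
required pairing (matching the hunk below):

    if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
         op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) {
        atomic_inc(&io->count);             /* take the ref this region owns */
        dec_count(io, region, -EOPNOTSUPP); /* drop it and record the error  */
        return;
    }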
Bug: https://bugzilla.kernel.org/show_bug.cgi?id=196077 Reported-by: Zhang Yi Signed-off-by: Mike Snitzer --- drivers/md/dm-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3702e502466d..8d5ca30f6551 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -317,8 +317,8 @@ static void do_region(int op, int op_flags, unsigned region, else if (op == REQ_OP_WRITE_SAME) special_cmd_max_sectors = q->limits.max_write_same_sectors; if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || - op == REQ_OP_WRITE_SAME) && - special_cmd_max_sectors == 0) { + op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { + atomic_inc(&io->count); dec_count(io, region, -EOPNOTSUPP); return; } From 8e8320c9315c47a6a090188720ccff32a6a6ba18 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Jun 2017 17:56:13 -0600 Subject: [PATCH 075/149] blk-mq: fix performance regression with shared tags If we have shared tags enabled, then every IO completion will trigger a full loop of every queue belonging to a tag set, and every hardware queue for each of those queues, even if nothing needs to be done. This causes a massive performance regression if you have a lot of shared devices. Instead of doing this huge full scan on every IO, add an atomic counter to the main queue that tracks how many hardware queues have been marked as needing a restart. With that, we can avoid looking for restartable queues, if we don't have to. Max reports that this restores performance. Before this patch, 4K IOPS was limited to 22-23K IOPS. With the patch, we are running at 950-970K IOPS. Fixes: 6d8c6c0f97ad ("blk-mq: Restart a single queue if tag sets are shared") Reported-by: Max Gurtovoy Tested-by: Max Gurtovoy Reviewed-by: Bart Van Assche Tested-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 58 +++++++++++++++++++++++++++++++++--------- block/blk-mq-sched.h | 9 ------- block/blk-mq.c | 16 +++++++++--- include/linux/blkdev.h | 2 ++ 4 files changed, 61 insertions(+), 24 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1f5b692526ae..0ded5e846335 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -68,6 +68,45 @@ static void blk_mq_sched_assign_ioc(struct request_queue *q, __blk_mq_sched_assign_ioc(q, rq, bio, ioc); } +/* + * Mark a hardware queue as needing a restart. For shared queues, maintain + * a count of how many hardware queues are marked for restart. 
+ */ +static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_inc(&q->shared_hctx_restart); + } else + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return false; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); + } else + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + + if (blk_mq_hctx_has_pending(hctx)) { + blk_mq_run_hw_queue(hctx, true); + return true; + } + + return false; +} + struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, @@ -266,18 +305,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) -{ - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) { - blk_mq_run_hw_queue(hctx, true); - return true; - } - } - return false; -} - /** * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list * @pos: loop cursor. @@ -309,6 +336,13 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) unsigned int i, j; if (set->flags & BLK_MQ_F_TAG_SHARED) { + /* + * If this is 0, then we know that no hardware queues + * have RESTART marked. We're done. + */ + if (!atomic_read(&queue->shared_hctx_restart)) + return; + rcu_read_lock(); list_for_each_entry_rcu_rr(q, queue, &set->tag_list, tag_set_list) { diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index edafb5383b7b..5007edece51a 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -115,15 +115,6 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) return false; } -/* - * Mark a hardware queue as needing a restart. - */ -static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) -{ - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); -} - static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); diff --git a/block/blk-mq.c b/block/blk-mq.c index bb66c96850b1..958cedaff8b8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2103,20 +2103,30 @@ static void blk_mq_map_swqueue(struct request_queue *q, } } +/* + * Caller needs to ensure that we're either frozen/quiesced, or that + * the queue isn't live yet. 
+ */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) + if (shared) { + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_inc(&q->shared_hctx_restart); hctx->flags |= BLK_MQ_F_TAG_SHARED; - else + } else { + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + atomic_dec(&q->shared_hctx_restart); hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, + bool shared) { struct request_queue *q; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b74a3edcb3da..1ddd36bd2173 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -391,6 +391,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + atomic_t shared_hctx_restart; + struct blk_queue_stats *stats; struct rq_wb *rq_wb; From e4330d8bf669139a983255d1801733b64c2ae841 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Mon, 19 Jun 2017 15:53:01 +0300 Subject: [PATCH 076/149] ACPI / scan: Fix enumeration for special SPI and I2C devices Commit f406270bf73d ("ACPI / scan: Set the visited flag for all enumerated devices") caused that two group of special SPI or I2C devices do not enumerate. SPI and I2C devices are expected to be enumerated by the SPI and I2C subsystems but change caused that acpi_bus_attach() marks those devices with acpi_device_set_enumerated(). First group of devices are matched using Device Tree compatible property with special _HID "PRP0001". Those devices have matched scan handler, acpi_scan_attach_handler() retuns 1 and acpi_bus_attach() marks them with acpi_device_set_enumerated(). Second group of devices without valid _HID such as "LNXVIDEO" have device->pnp.type.platform_id set to zero and change again marks them with acpi_device_set_enumerated(). Fix this by flagging the SPI and I2C devices during struct acpi_device object initialization time and let the code in acpi_bus_attach() to go through the device_attach() and acpi_default_enumeration() path for all SPI and I2C devices. Fixes: f406270bf73d (ACPI / scan: Set the visited flag for all enumerated devices) Signed-off-by: Jarkko Nikula Acked-by: Mika Westerberg Cc: 4.11+ # 4.11+ Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/scan.c | 67 +++++++++++++++++++++++------------------ include/acpi/acpi_bus.h | 3 +- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 3a10d7573477..d53162997f32 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1428,6 +1428,37 @@ static void acpi_init_coherency(struct acpi_device *adev) adev->flags.coherent_dma = cca; } +static int acpi_check_spi_i2c_slave(struct acpi_resource *ares, void *data) +{ + bool *is_spi_i2c_slave_p = data; + + if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS) + return 1; + + /* + * devices that are connected to UART still need to be enumerated to + * platform bus + */ + if (ares->data.common_serial_bus.type != ACPI_RESOURCE_SERIAL_TYPE_UART) + *is_spi_i2c_slave_p = true; + + /* no need to do more checking */ + return -1; +} + +static bool acpi_is_spi_i2c_slave(struct acpi_device *device) +{ + struct list_head resource_list; + bool is_spi_i2c_slave = false; + + INIT_LIST_HEAD(&resource_list); + acpi_dev_get_resources(device, &resource_list, acpi_check_spi_i2c_slave, + &is_spi_i2c_slave); + acpi_dev_free_resource_list(&resource_list); + + return is_spi_i2c_slave; +} + void acpi_init_device_object(struct acpi_device *device, acpi_handle handle, int type, unsigned long long sta) { @@ -1443,6 +1474,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle, acpi_bus_get_flags(device); device->flags.match_driver = false; device->flags.initialized = true; + device->flags.spi_i2c_slave = acpi_is_spi_i2c_slave(device); acpi_device_clear_enumerated(device); device_initialize(&device->dev); dev_set_uevent_suppress(&device->dev, true); @@ -1727,38 +1759,13 @@ static acpi_status acpi_bus_check_add(acpi_handle handle, u32 lvl_not_used, return AE_OK; } -static int acpi_check_spi_i2c_slave(struct acpi_resource *ares, void *data) -{ - bool *is_spi_i2c_slave_p = data; - - if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS) - return 1; - - /* - * devices that are connected to UART still need to be enumerated to - * platform bus - */ - if (ares->data.common_serial_bus.type != ACPI_RESOURCE_SERIAL_TYPE_UART) - *is_spi_i2c_slave_p = true; - - /* no need to do more checking */ - return -1; -} - static void acpi_default_enumeration(struct acpi_device *device) { - struct list_head resource_list; - bool is_spi_i2c_slave = false; - /* * Do not enumerate SPI/I2C slaves as they will be enumerated by their * respective parents. 
*/ - INIT_LIST_HEAD(&resource_list); - acpi_dev_get_resources(device, &resource_list, acpi_check_spi_i2c_slave, - &is_spi_i2c_slave); - acpi_dev_free_resource_list(&resource_list); - if (!is_spi_i2c_slave) { + if (!device->flags.spi_i2c_slave) { acpi_create_platform_device(device, NULL); acpi_device_set_enumerated(device); } else { @@ -1854,7 +1861,7 @@ static void acpi_bus_attach(struct acpi_device *device) return; device->flags.match_driver = true; - if (ret > 0) { + if (ret > 0 && !device->flags.spi_i2c_slave) { acpi_device_set_enumerated(device); goto ok; } @@ -1863,10 +1870,10 @@ static void acpi_bus_attach(struct acpi_device *device) if (ret < 0) return; - if (device->pnp.type.platform_id) - acpi_default_enumeration(device); - else + if (!device->pnp.type.platform_id && !device->flags.spi_i2c_slave) acpi_device_set_enumerated(device); + else + acpi_default_enumeration(device); ok: list_for_each_entry(child, &device->children, node) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 197f3fffc9a7..408c7820e200 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -210,7 +210,8 @@ struct acpi_device_flags { u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; - u32 reserved:20; + u32 spi_i2c_slave:1; + u32 reserved:19; }; /* File System */ From 2f263d145140ea4b9f5762b15886ae26195a764a Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Thu, 15 Jun 2017 10:36:22 +0200 Subject: [PATCH 077/149] kbuild: fix header installation under fakechroot environment Since commit fcc8487d477a ("uapi: export all headers under uapi directories") fakechroot make bindeb-pkg fails, mismatching files for directories: touch: cannot touch 'usr/include/video/uvesafb.h/.install': Not a directory This due to a bug in fakechroot: when using the function $(wildcard $(srcdir)/*/.) in a makefile, under a fakechroot environment, not only directories but also files are returned. To circumvent that, we are using the functions: $(sort $(dir $(wildcard $(srcdir)/*/)))) Fixes: fcc8487d477a ("uapi: export all headers under uapi directories") Signed-off-by: Richard Genoud Signed-off-by: Masahiro Yamada --- scripts/Makefile.headersinst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index ce753a408c56..c583a1e1bd3c 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -14,7 +14,15 @@ __headers: include scripts/Kbuild.include srcdir := $(srctree)/$(obj) -subdirs := $(patsubst $(srcdir)/%/.,%,$(wildcard $(srcdir)/*/.)) + +# When make is run under a fakechroot environment, the function +# $(wildcard $(srcdir)/*/.) doesn't only return directories, but also regular +# files. So, we are using a combination of sort/dir/wildcard which works +# with fakechroot. +subdirs := $(patsubst $(srcdir)/%/,%,\ + $(filter-out $(srcdir)/,\ + $(sort $(dir $(wildcard $(srcdir)/*/))))) + # caller may set destination dir (when installing to asm/) _dst := $(if $(dst),$(dst),$(obj)) From eb5e248d502bec191bd99f04cae8b49992b3abde Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 21 Jun 2017 20:27:35 -0700 Subject: [PATCH 078/149] xfs: don't allow bmap on rt files bmap returns a dumb LBA address but not the block device that goes with that LBA. Swapfiles don't care about this and will blindly assume that the data volume is the correct blockdev, which is totally bogus for files on the rt subvolume. This results in the swap code doing IOs to arbitrary locations on the data device(!) 
if the passed in mapping is a realtime file, so just turn off bmap for rt files. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_aops.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 09af0f7cd55e..3b91faacc1ba 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1316,9 +1316,12 @@ xfs_vm_bmap( * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasseѕ the file system for actual I/O. We really can't allow * that on reflinks inodes, so we have to skip out here. And yes, - * 0 is the magic code for a bmap error.. + * 0 is the magic code for a bmap error. + * + * Since we don't pass back blockdev info, we can't return bmap + * information for rt files either. */ - if (xfs_is_reflink_inode(ip)) + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; filemap_write_and_wait(mapping); From 7c88e21aefcf86fb41b48b2e04528db5a30fbe18 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Thu, 22 Jun 2017 11:37:10 +0300 Subject: [PATCH 079/149] xfrm6: Fix IPv6 payload_len in xfrm6_transport_finish IPv6 payload length indicates the size of the payload, including any extension headers. In xfrm6_transport_finish, ipv6_hdr(skb)->payload_len is set to the payload size only, regardless of the presence of any extension headers. After ESP GRO transport mode decapsulation, ipv6_rcv trims the packet according to the wrong payload_len, thus corrupting the packet. Set payload_len to account for extension headers as well. Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 08a807b29298..3ef5d913e7a3 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -43,8 +43,8 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) return 1; #endif - ipv6_hdr(skb)->payload_len = htons(skb->len); __skb_push(skb, skb->data - skb_network_header(skb)); + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); From ca3a1b856636f596c691ab5b3764045a142186db Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Thu, 22 Jun 2017 11:37:11 +0300 Subject: [PATCH 080/149] esp6_offload: Fix IP6CB(skb)->nhoff for ESP GRO IP6CB(skb)->nhoff is the offset of the nexthdr field in an IPv6 header, unless there are extension headers present, in which case nhoff points to the nexthdr field of the last extension header. In non-GRO code path, nhoff is set by ipv6_rcv before any XFRM code is executed. Conversely, in GRO code path (when esp6_offload is loaded), nhoff is not set. The following functions fail to read the correct value and eventually the packet is dropped: xfrm6_transport_finish xfrm6_tunnel_input xfrm6_rcv_tnl Set nhoff to the proper offset of nexthdr in esp6_gro_receive. 
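As a worked example (layout and numbers are illustrative, not part of the patch), take a decapsulated packet that carried a single hop-by-hop options header in front of ESP; counting from skb_network_header(skb):

    /*  0 .. 39 : struct ipv6hdr      - its nexthdr byte sits at offset 6 (NEXTHDR_HOP)  */
    /* 40 .. 47 : struct ipv6_opt_hdr - its nexthdr byte sits at offset 40 (NEXTHDR_ESP) */
    /* 48 .. .. : ESP                                                                    */
    IP6CB(skb)->nhoff = 40;   /* the last extension header's nexthdr byte,               */
                              /* not offsetof(struct ipv6hdr, nexthdr) == 6              */

With no extension headers present, the fast path keeps the old behaviour and simply returns offsetof(struct ipv6hdr, nexthdr).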
Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- net/ipv6/esp6_offload.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d950d43ba255..f02f131f6435 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -30,6 +30,25 @@ #include #include +static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen) +{ + int off = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr; + + if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP)) + return offsetof(struct ipv6hdr, nexthdr); + + while (off < nhlen) { + exthdr = (void *)ipv6_hdr + off; + if (exthdr->nexthdr == NEXTHDR_ESP) + return off; + + off += ipv6_optlen(exthdr); + } + + return 0; +} + static struct sk_buff **esp6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -38,6 +57,7 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, struct xfrm_state *x; __be32 seq; __be32 spi; + int nhoff; int err; skb_pull(skb, offset); @@ -72,6 +92,11 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, xo->flags |= XFRM_GRO; + nhoff = esp6_nexthdr_esp_offset(ipv6_hdr(skb), offset); + if (!nhoff) + goto out; + + IP6CB(skb)->nhoff = nhoff; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); From 6c782a5ea56a799658e213a78dc1455264938afa Mon Sep 17 00:00:00 2001 From: Michail Georgios Etairidis Date: Tue, 20 Jun 2017 10:20:42 +0200 Subject: [PATCH 081/149] i2c: imx: Use correct function to write to register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The i2c-imx driver incorrectly uses readb()/writeb() to read and write to the appropriate registers when performing a repeated start. The appropriate imx_i2c_read_reg()/imx_i2c_write_reg() functions should be used instead. Performing a repeated start results in a kernel panic. The platform is imx. Signed-off-by: Michail G Etairidis Fixes: ce1a78840ff7 ("i2c: imx: add DMA support for freescale i2c driver") Fixes: 054b62d9f25c ("i2c: imx: fix the i2c bus hang issue when do repeat restart") Acked-by: Fugang Duan Acked-by: Uwe Kleine-König Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-imx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index 95ed17183e73..54a47b40546f 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -734,9 +734,9 @@ static int i2c_imx_dma_read(struct imx_i2c_struct *i2c_imx, * the first read operation, otherwise the first read cost * one extra clock cycle. */ - temp = readb(i2c_imx->base + IMX_I2C_I2CR); + temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); temp |= I2CR_MTX; - writeb(temp, i2c_imx->base + IMX_I2C_I2CR); + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); } msgs->buf[msgs->len-1] = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2DR); @@ -857,9 +857,9 @@ static int i2c_imx_read(struct imx_i2c_struct *i2c_imx, struct i2c_msg *msgs, bo * the first read operation, otherwise the first read cost * one extra clock cycle. 
*/ - temp = readb(i2c_imx->base + IMX_I2C_I2CR); + temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); temp |= I2CR_MTX; - writeb(temp, i2c_imx->base + IMX_I2C_I2CR); + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); } } else if (i == (msgs->len - 2)) { dev_dbg(&i2c_imx->adapter.dev, From fb3a5055cd7098f8d1dd0cd38d7172211113255f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 19 Jun 2017 07:26:09 -0700 Subject: [PATCH 082/149] perf/x86/intel: Add 1G DTLB load/store miss support for SKL Current DTLB load/store miss events (0x608/0x649) only counts 4K,2M and 4M page size. Need to extend the events to support any page size (4K/2M/4M/1G). The complete DTLB load/store miss events are: DTLB_LOAD_MISSES.WALK_COMPLETED 0xe08 DTLB_STORE_MISSES.WALK_COMPLETED 0xe49 Signed-off-by: Kan Liang Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Link: http://lkml.kernel.org/r/20170619142609.11058-1-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a6d91d4e37a1..110ce8238466 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -431,11 +431,11 @@ static __initconst const u64 skl_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x0, From addb63c18a0d52a9ce2611d039f981f7b6148d2b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 19 Jun 2017 08:02:28 +0200 Subject: [PATCH 083/149] KVM: s390: gaccess: fix real-space designation asce handling for gmap shadows For real-space designation asces the asce origin part is only a token. The asce token origin must not be used to generate an effective address for storage references. This however is erroneously done within kvm_s390_shadow_tables(). Furthermore within the same function the wrong parts of virtual addresses are used to generate a corresponding real address (e.g. the region second index is used as region first index). Both of the above can result in incorrect address translations. Only for real space designations with a token origin of zero and addresses below one megabyte the translation was correct. Furthermore replace a "!asce.r" statement with a "!*fake" statement to make it more obvious that a specific condition has nothing to do with the architecture, but with the fake handling of real space designations. 
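For reference, a sketch of how a 64-bit virtual address is split on s390 (macro names here are ad hoc, the kernel uses the vaddr union for this), which is what ties each index field to the shift used for the fake walk:

    #define RFX(va)  (((va) >> 53) & 0x7ffUL)  /* region-first index,  one entry spans 2^53 bytes */
    #define RSX(va)  (((va) >> 42) & 0x7ffUL)  /* region-second index, one entry spans 2^42 bytes */
    #define RTX(va)  (((va) >> 31) & 0x7ffUL)  /* region-third index,  one entry spans 2^31 bytes */
    #define SX(va)   (((va) >> 20) & 0x7ffUL)  /* segment index,       one entry spans 2^20 bytes */

    /* for a real-space (fake) walk the next-level "table" is simply guest real
     * storage, so each step adds the index scaled back up, e.g.
     * ptr += (unsigned long) RFX(va) << 53, which is the pairing this patch restores. */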
Fixes: 3218f7094b6b ("s390/mm: support real-space for gmap shadows") Cc: David Hildenbrand Cc: stable@vger.kernel.org Signed-off-by: Heiko Carstens Reviewed-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 9da243d94cc3..3b297fa3aa67 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -977,11 +977,12 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, ptr = asce.origin * 4096; if (asce.r) { *fake = 1; + ptr = 0; asce.dt = ASCE_TYPE_REGION1; } switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !asce.r) + if (vaddr.rfx01 > asce.tl && !*fake) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: @@ -1009,8 +1010,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, union region1_table_entry rfte; if (*fake) { - /* offset in 16EB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + ptr += (unsigned long) vaddr.rfx << 53; rfte.val = ptr; goto shadow_r2t; } @@ -1036,8 +1036,7 @@ shadow_r2t: union region2_table_entry rste; if (*fake) { - /* offset in 8PB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + ptr += (unsigned long) vaddr.rsx << 42; rste.val = ptr; goto shadow_r3t; } @@ -1064,8 +1063,7 @@ shadow_r3t: union region3_table_entry rtte; if (*fake) { - /* offset in 4TB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + ptr += (unsigned long) vaddr.rtx << 31; rtte.val = ptr; goto shadow_sgt; } @@ -1101,8 +1099,7 @@ shadow_sgt: union segment_table_entry ste; if (*fake) { - /* offset in 2G guest memory block */ - ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ptr += (unsigned long) vaddr.sx << 20; ste.val = ptr; goto shadow_pgt; } From bbd5ff50afffcf4a01d05367524736c57607a478 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 20 Jun 2017 18:37:28 +1000 Subject: [PATCH 084/149] powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD NPU2 requires an extra explicit flush to an active GPU PID when sending address translation shoot downs (ATSDs) to reliably flush the GPU TLB. This patch adds just such a flush at the end of each sequence of ATSDs. We can safely use PID 0 which is always reserved and active on the GPU. PID 0 is only used for init_mm which will never be a user mm on the GPU. To enforce this we add a check in pnv_npu2_init_context() just in case someone tries to use PID 0 on the GPU. 
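Put together, the shootdown sequence for an address range then looks roughly like this (pseudo-code only; ATSD() stands in for the mmio_invalidate_* helpers):

    for (addr = start; addr < end; addr += PAGE_SIZE)
            ATSD(pid, addr, flush = false);   /* per-page shootdowns, no flush yet */
    ATSD(pid, addr, flush = true);            /* final shootdown requests the flush */
    /* wait for completion, then per NPU issue the extra ATSD(pid = 0, flush) and
     * wait for that as well; PID 0 is safe because it only ever maps init_mm. */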
Signed-off-by: Alistair Popple [mpe: Use true/false for bool literals] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 94 ++++++++++++++++-------- 1 file changed, 65 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index e6f444b46207..b5d960d6db3d 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -449,7 +449,7 @@ static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, return mmio_atsd_reg; } -static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) { unsigned long launch; @@ -465,12 +465,15 @@ static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) /* PID */ launch |= pid << PPC_BITLSHIFT(38); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + /* Invalidating the entire process doesn't use a va */ return mmio_launch_invalidate(npu, launch, 0); } static int mmio_invalidate_va(struct npu *npu, unsigned long va, - unsigned long pid) + unsigned long pid, bool flush) { unsigned long launch; @@ -486,26 +489,60 @@ static int mmio_invalidate_va(struct npu *npu, unsigned long va, /* PID */ launch |= pid << PPC_BITLSHIFT(38); + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + return mmio_launch_invalidate(npu, launch, va); } #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) +struct mmio_atsd_reg { + struct npu *npu; + int reg; +}; + +static void mmio_invalidate_wait( + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) +{ + struct npu *npu; + int i, reg; + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + + put_mmio_atsd_reg(npu, reg); + + /* + * The GPU requires two flush ATSDs to ensure all entries have + * been flushed. We use PID 0 as it will never be used for a + * process on the GPU. + */ + if (flush) + mmio_invalidate_pid(npu, 0, true); + } +} + /* * Invalidate either a single address or an entire PID depending on * the value of va. 
*/ static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address) + unsigned long address, bool flush) { - int i, j, reg; + int i, j; struct npu *npu; struct pnv_phb *nphb; struct pci_dev *npdev; - struct { - struct npu *npu; - int reg; - } mmio_atsd_reg[NV_MAX_NPUS]; + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; /* @@ -525,10 +562,11 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, if (va) mmio_atsd_reg[i].reg = - mmio_invalidate_va(npu, address, pid); + mmio_invalidate_va(npu, address, pid, + flush); else mmio_atsd_reg[i].reg = - mmio_invalidate_pid(npu, pid); + mmio_invalidate_pid(npu, pid, flush); /* * The NPU hardware forwards the shootdown to all GPUs @@ -544,18 +582,10 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, */ flush_tlb_mm(npu_context->mm); - /* Wait for all invalidations to complete */ - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; - - /* Wait for completion */ - npu = mmio_atsd_reg[i].npu; - reg = mmio_atsd_reg[i].reg; - while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) - cpu_relax(); - put_mmio_atsd_reg(npu, reg); - } + mmio_invalidate_wait(mmio_atsd_reg, flush); + if (flush) + /* Wait for the flush to complete */ + mmio_invalidate_wait(mmio_atsd_reg, false); } static void pnv_npu2_mn_release(struct mmu_notifier *mn, @@ -571,7 +601,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, * There should be no more translation requests for this PID, but we * need to ensure any entries for it are removed from the TLB. */ - mmio_invalidate(npu_context, 0, 0); + mmio_invalidate(npu_context, 0, 0, true); } static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, @@ -581,7 +611,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, { struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, 1, address); + mmio_invalidate(npu_context, 1, address, true); } static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, @@ -590,7 +620,7 @@ static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, { struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, 1, address); + mmio_invalidate(npu_context, 1, address, true); } static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, @@ -600,8 +630,11 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct npu_context *npu_context = mn_to_npu_context(mn); unsigned long address; - for (address = start; address <= end; address += PAGE_SIZE) - mmio_invalidate(npu_context, 1, address); + for (address = start; address < end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address, false); + + /* Do the flush only on the final addess == end */ + mmio_invalidate(npu_context, 1, address, true); } static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { @@ -651,8 +684,11 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, /* No nvlink associated with this GPU device */ return ERR_PTR(-ENODEV); - if (!mm) { - /* kernel thread contexts are not supported */ + if (!mm || mm->context.id == 0) { + /* + * Kernel thread contexts are not supported and context id 0 is + * reserved on the GPU. 
+ */ return ERR_PTR(-EINVAL); } From c8401dda2f0a00cd25c0af6a95ed50e478d25de4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 7 Jun 2017 15:13:14 +0200 Subject: [PATCH 085/149] KVM: x86: fix singlestepping over syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TF is handled a bit differently for syscall and sysret, compared to the other instructions: TF is checked after the instruction completes, so that the OS can disable #DB at a syscall by adding TF to FMASK. When the sysret is executed the #DB is taken "as if" the syscall insn just completed. KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. Fix the behavior, otherwise you could get #DB on a user stack which is not nice. This does not affect Linux guests, as they use an IST or task gate for #DB. This fixes CVE-2017-7518. Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.c | 62 +++++++++++++++--------------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 055962615779..722d0e568863 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -296,6 +296,7 @@ struct x86_emulate_ctxt { bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0816ab2e8adc..80890dee66ce 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2742,6 +2742,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); } + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87d3cb901935..0e846f0cb83b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5313,6 +5313,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5528,36 +5530,25 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run; - /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - /* - * "Certain debug exceptions may clear bit 0-3. 
The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5567,7 +5558,17 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) int r = EMULATE_DONE; kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + + /* + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. + * + * This is correct even for TF set by the guest, because "the + * processor will not generate this exception after the instruction + * that sets the TF flag". + */ + if (unlikely(rflags & X86_EFLAGS_TF)) + kvm_vcpu_do_singlestep(vcpu, &r); return r == EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -5726,8 +5727,9 @@ restart: toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags); From b866203d872d5deeafcecd25ea429d6748b5bd56 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 20 Jun 2017 12:48:11 -0500 Subject: [PATCH 086/149] net/phy: micrel: configure intterupts after autoneg workaround The commit ("net/phy: micrel: Add workaround for bad autoneg") fixes an autoneg failure case by resetting the hardware. This turns off intterupts. Things will work themselves out if the phy polls, as it will figure out it's state during a poll. However if the phy uses only intterupts, the phy will stall, since interrupts are off. This patch fixes the issue by calling config_intr after resetting the phy. Fixes: d2fd719bcb0e ("net/phy: micrel: Add workaround for bad autoneg ") Signed-off-by: Zach Brown Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index b9252b8d81ff..8b2038844ba9 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -619,6 +619,8 @@ static int ksz9031_read_status(struct phy_device *phydev) if ((regval & 0xFF) == 0xFF) { phy_init_hw(phydev); phydev->link = 0; + if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev)) + phydev->drv->config_intr(phydev); } return 0; From 76da0704507bbc51875013f6557877ab308cfd0a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 20 Jun 2017 11:42:27 -0700 Subject: [PATCH 087/149] ipv6: only call ip6_route_dev_notify() once for NETDEV_UNREGISTER In commit 242d3a49a2a1 ("ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf") I assumed NETDEV_REGISTER and NETDEV_UNREGISTER are paired, unfortunately, as reported by jeffy, netdev_wait_allrefs() could rebroadcast NETDEV_UNREGISTER event until all refs are gone. We have to add an additional check to avoid this corner case. For netdev_wait_allrefs() dev->reg_state is NETREG_UNREGISTERED, for dev_change_net_namespace(), dev->reg_state is NETREG_REGISTERED. So check for dev->reg_state != NETREG_UNREGISTERED. Fixes: 242d3a49a2a1 ("ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf") Reported-by: jeffy Cc: David Ahern Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7cebd954d5bb..322bd62e688b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3722,7 +3722,11 @@ static int ip6_route_dev_notify(struct notifier_block *this, net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif - } else if (event == NETDEV_UNREGISTER) { + } else if (event == NETDEV_UNREGISTER && + dev->reg_state != NETREG_UNREGISTERED) { + /* NETDEV_UNREGISTER could be fired for multiple times by + * netdev_wait_allrefs(). Make sure we only call this once. + */ in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); From b88ff4f8c9ef0279f82bca0a83ed860b654f0f8d Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Wed, 21 Jun 2017 14:12:04 +0530 Subject: [PATCH 088/149] drivers: net: cpsw-common: Fix reading of mac address for am43 SoCs cpsw driver tries to get macid for am43xx SoCs using the compatible ti,am4372. But not all variants of am43x uses this complatible like epos evm uses ti,am438x. So use a generic compatible ti,am43 to get macid for all am43 based platforms. Reviewed-by: Dave Gerlach Signed-off-by: Lokesh Vutla Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpsw-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw-common.c b/drivers/net/ethernet/ti/cpsw-common.c index 1562ab4151e1..56ba411421f0 100644 --- a/drivers/net/ethernet/ti/cpsw-common.c +++ b/drivers/net/ethernet/ti/cpsw-common.c @@ -90,7 +90,7 @@ int ti_cm_get_macid(struct device *dev, int slave, u8 *mac_addr) if (of_device_is_compatible(dev->of_node, "ti,dm816-emac")) return cpsw_am33xx_cm_get_macid(dev, 0x30, slave, mac_addr); - if (of_machine_is_compatible("ti,am4372")) + if (of_machine_is_compatible("ti,am43")) return cpsw_am33xx_cm_get_macid(dev, 0x630, slave, mac_addr); if (of_machine_is_compatible("ti,dra7")) From 191cdb3822e5df6b3c8b9f8cb8c4bf93f6cc90c7 Mon Sep 17 00:00:00 2001 From: Serhey Popovych Date: Wed, 21 Jun 2017 12:12:24 +0300 Subject: [PATCH 089/149] veth: Be more robust on network device creation when no attributes There are number of problems with configuration peer network device in absence of IFLA_VETH_PEER attributes where attributes for main network device shared with peer. First it is not feasible to configure both network devices with same MAC address since this makes communication in such configuration problematic. This case can be reproduced with following sequence: # ip link add address 02:11:22:33:44:55 type veth # ip li sh ... 26: veth0@veth1: mtu 1500 qdisc \ noop state DOWN mode DEFAULT qlen 1000 link/ether 00:11:22:33:44:55 brd ff:ff:ff:ff:ff:ff 27: veth1@veth0: mtu 1500 qdisc \ noop state DOWN mode DEFAULT qlen 1000 link/ether 00:11:22:33:44:55 brd ff:ff:ff:ff:ff:ff Second it is not possible to register both main and peer network devices with same name, that happens when name for main interface is given with IFLA_IFNAME and same attribute reused for peer. This case can be reproduced with following sequence: # ip link add dev veth1a type veth RTNETLINK answers: File exists To fix both of the cases check if corresponding netlink attributes are taken from peer_tb when valid or name based on rtnl ops kind and random address is used. Signed-off-by: Serhey Popovych Signed-off-by: David S. Miller --- drivers/net/veth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0156fe8cac17..364fa9d11d1a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -383,7 +383,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, tbp = tb; } - if (tbp[IFLA_IFNAME]) { + if (ifmp && tbp[IFLA_IFNAME]) { nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { @@ -402,7 +402,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, return PTR_ERR(peer); } - if (tbp[IFLA_ADDRESS] == NULL) + if (!ifmp || !tbp[IFLA_ADDRESS]) eth_hw_addr_random(peer); if (ifmp && (dev->ifindex != 0)) From dfa523ae9f2542bee4cddaea37b3be3e157f6e6b Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 21 Jun 2017 10:21:22 +0100 Subject: [PATCH 090/149] xen-netback: correctly schedule rate-limited queues Add a flag to indicate if a queue is rate-limited. Test the flag in NAPI poll handler and avoid rescheduling the queue if true, otherwise we risk locking up the host. The rescheduling will be done in the timer callback function. Reported-by: Jean-Louis Dupond Signed-off-by: Wei Liu Tested-by: Jean-Louis Dupond Reviewed-by: Paul Durrant Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/interface.c | 6 +++++- drivers/net/xen-netback/netback.c | 6 +++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 530586be05b4..5b1d2e8402d9 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -199,6 +199,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */ unsigned long remaining_credit; struct timer_list credit_timeout; u64 credit_window_start; + bool rate_limited; /* Statistics */ struct xenvif_stats stats; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 8397f6c92451..e322a862ddfe 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -106,7 +106,11 @@ static int xenvif_poll(struct napi_struct *napi, int budget) if (work_done < budget) { napi_complete_done(napi, work_done); - xenvif_napi_schedule_or_enable_events(queue); + /* If the queue is rate-limited, it shall be + * rescheduled in the timer callback. + */ + if (likely(!queue->rate_limited)) + xenvif_napi_schedule_or_enable_events(queue); } return work_done; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 602d408fa25e..5042ff8d449a 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -180,6 +180,7 @@ static void tx_add_credit(struct xenvif_queue *queue) max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ queue->remaining_credit = min(max_credit, max_burst); + queue->rate_limited = false; } void xenvif_tx_credit_callback(unsigned long data) @@ -686,8 +687,10 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size) msecs_to_jiffies(queue->credit_usec / 1000); /* Timer could already be pending in rare cases. */ - if (timer_pending(&queue->credit_timeout)) + if (timer_pending(&queue->credit_timeout)) { + queue->rate_limited = true; return true; + } /* Passed the point where we can replenish credit? */ if (time_after_eq64(now, next_credit)) { @@ -702,6 +705,7 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size) mod_timer(&queue->credit_timeout, next_credit); queue->credit_window_start = next_credit; + queue->rate_limited = true; return true; } From e26f43faa0d79dd06e9e94829696b68b9940c2ee Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 21 Jun 2017 07:59:16 -0400 Subject: [PATCH 091/149] macvlan: Do not return error when setting the same mac address The user currently gets an EBUSY error when attempting to set the mac address on a macvlan device to the same value. This should really be a no-op as nothing changes. Catch the condition and return early. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 67bf7ebae5c6..de214fbda173 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -703,6 +703,10 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + /* If the addresses are the same, this is a no-op */ + if (ether_addr_equal(dev->dev_addr, addr->sa_data)) + return 0; + if (vlan->mode == MACVLAN_MODE_PASSTHRU) { dev_set_mac_address(vlan->lowerdev, addr); return 0; From e696cda7bd091411705a4e868ce480eaa3082dbf Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 21 Jun 2017 07:59:17 -0400 Subject: [PATCH 092/149] macvlan: Fix passthru macvlan mac address inheritance When a lower device of the passthru macvlan changes it's address, passthru macvlan is supposed to change it's own address as well. However, that doesn't happen correctly because the check in macvlan_addr_busy() will catch the fact that the lower level (port) mac address is the same as the address we are trying to assign to the macvlan, and return an error. As a reasult, the address of the passthru macvlan device is never changed. The same thing happens when the user attempts to change the mac address of the passthru macvlan. The simple solution appers to be to not check against the lower device in case of passthru macvlan device, since the 2 addresses are _supposed_ to be the same. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index de214fbda173..0c9c6da3427a 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -185,7 +185,8 @@ static bool macvlan_addr_busy(const struct macvlan_port *port, * currently in use by the underlying device or * another macvlan. */ - if (ether_addr_equal_64bits(port->dev->dev_addr, addr)) + if (!port->passthru && + ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) From 43c2d578a0ce0d6067a02b46461811aced551425 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 21 Jun 2017 07:59:18 -0400 Subject: [PATCH 093/149] macvlan: convert port passthru to flags. Convert the port passthru boolean into flags with accesor functions. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 0c9c6da3427a..4d013ca79dae 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -39,13 +39,15 @@ #define MACVLAN_HASH_SIZE (1<flags & MACVLAN_F_PASSTHRU; +} + +static inline void macvlan_set_passthru(struct macvlan_port *port) +{ + port->flags |= MACVLAN_F_PASSTHRU; +} + /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { @@ -185,7 +197,7 @@ static bool macvlan_addr_busy(const struct macvlan_port *port, * currently in use by the underlying device or * another macvlan. 
*/ - if (!port->passthru && + if (!macvlan_passthru(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; @@ -446,7 +458,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) } macvlan_forward_source(skb, port, eth->h_source); - if (port->passthru) + if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else @@ -575,7 +587,7 @@ static int macvlan_open(struct net_device *dev) struct net_device *lowerdev = vlan->lowerdev; int err; - if (vlan->port->passthru) { + if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) @@ -650,7 +662,7 @@ static int macvlan_stop(struct net_device *dev) dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); - if (vlan->port->passthru) { + if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; @@ -683,7 +695,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) if (macvlan_addr_busy(vlan->port, addr)) return -EBUSY; - if (!vlan->port->passthru) { + if (!macvlan_passthru(vlan->port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; @@ -933,7 +945,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ - if (!vlan->port->passthru && is_unicast_ether_addr(addr)) + if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) @@ -957,7 +969,7 @@ static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ - if (!vlan->port->passthru && is_unicast_ether_addr(addr)) + if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) @@ -1125,7 +1137,6 @@ static int macvlan_port_create(struct net_device *dev) if (port == NULL) return -ENOMEM; - port->passthru = false; port->dev = dev; INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) @@ -1331,7 +1342,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ - if (port->passthru) { + if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. 
*/ @@ -1357,7 +1368,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, err = -EINVAL; goto destroy_macvlan_port; } - port->passthru = true; + macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } @@ -1439,7 +1450,7 @@ static int macvlan_changelink(struct net_device *dev, if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; - if (vlan->port->passthru && promisc) { + if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) @@ -1602,7 +1613,7 @@ static int macvlan_device_event(struct notifier_block *unused, } break; case NETDEV_CHANGEADDR: - if (!port->passthru) + if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, From 18c8c54de9a619ba5533419e0170433e20c0ee3e Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 21 Jun 2017 07:59:19 -0400 Subject: [PATCH 094/149] macvlan: Let passthru macvlan correctly restore lower mac address Passthru macvlans directly change the mac address of the lower level device. That's OK, but after the macvlan is deleted, the lower device is left with changed address and one needs to reboot to bring back the origina HW addresses. This scenario is actually quite common with passthru macvtap devices. This patch attempts to solve this, by storing the mac address of the lower device in macvlan_port structure and keeping track of it through the changes. After this patch, any changes to the lower device mac address done trough the macvlan device, will be reverted back. Any changs done directly to the lower device mac address will be kept. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 47 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 4d013ca79dae..72b801803aa4 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -40,6 +40,7 @@ #define MACVLAN_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 +#define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; @@ -51,6 +52,7 @@ struct macvlan_port { int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); + unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { @@ -78,6 +80,21 @@ static inline void macvlan_set_passthru(struct macvlan_port *port) port->flags |= MACVLAN_F_PASSTHRU; } +static inline bool macvlan_addr_change(const struct macvlan_port *port) +{ + return port->flags & MACVLAN_F_ADDRCHANGE; +} + +static inline void macvlan_set_addr_change(struct macvlan_port *port) +{ + port->flags |= MACVLAN_F_ADDRCHANGE; +} + +static inline void macvlan_clear_addr_change(struct macvlan_port *port) +{ + port->flags &= ~MACVLAN_F_ADDRCHANGE; +} + /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { @@ -193,11 +210,11 @@ static void macvlan_hash_change_addr(struct macvlan_dev *vlan, static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { - /* Test to see if the specified multicast address is + /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. 
*/ - if (!macvlan_passthru(port) && + if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; @@ -685,6 +702,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; + struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { @@ -695,7 +713,7 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) if (macvlan_addr_busy(vlan->port, addr)) return -EBUSY; - if (!macvlan_passthru(vlan->port)) { + if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; @@ -705,6 +723,15 @@ static int macvlan_sync_address(struct net_device *dev, unsigned char *addr) macvlan_hash_change_addr(vlan, addr); } + if (macvlan_passthru(port) && !macvlan_addr_change(port)) { + /* Since addr_change isn't set, we are here due to lower + * device change. Save the lower-dev address so we can + * restore it later. + */ + ether_addr_copy(vlan->port->perm_addr, + lowerdev->dev_addr); + } + macvlan_clear_addr_change(port); return 0; } @@ -721,6 +748,7 @@ static int macvlan_set_mac_address(struct net_device *dev, void *p) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { + macvlan_set_addr_change(vlan->port); dev_set_mac_address(vlan->lowerdev, addr); return 0; } @@ -1138,6 +1166,7 @@ static int macvlan_port_create(struct net_device *dev) return -ENOMEM; port->dev = dev; + ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); @@ -1177,6 +1206,18 @@ static void macvlan_port_destroy(struct net_device *dev) kfree_skb(skb); } + /* If the lower device address has been changed by passthru + * macvlan, put it back. + */ + if (macvlan_passthru(port) && + !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { + struct sockaddr sa; + + sa.sa_family = port->dev->type; + memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len); + dev_set_mac_address(port->dev, &sa); + } + kfree(port); } From 60abc0be96e00ca71bac083215ac91ad2e575096 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 21 Jun 2017 14:34:58 -0700 Subject: [PATCH 095/149] ipv6: avoid unregistering inet6_dev for loopback The per netns loopback_dev->ip6_ptr is unregistered and set to NULL when its mtu is set to smaller than IPV6_MIN_MTU, this leads to that we could set rt->rt6i_idev NULL after a rt6_uncached_list_flush_dev() and then crash after another call. In this case we should just bring its inet6_dev down, rather than unregistering it, at least prior to commit 176c39af29bc ("netns: fix addrconf_ifdown kernel panic") we always override the case for loopback. Thanks a lot to Andrey for finding a reliable reproducer. Fixes: 176c39af29bc ("netns: fix addrconf_ifdown kernel panic") Reported-by: Andrey Konovalov Cc: Andrey Konovalov Cc: Daniel Lezcano Cc: David Ahern Signed-off-by: Cong Wang Acked-by: David Ahern Tested-by: Andrey Konovalov Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 686c92375e81..1d2dbace42ff 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3369,6 +3369,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *info; struct inet6_dev *idev = __in6_dev_get(dev); + struct net *net = dev_net(dev); int run_pending = 0; int err; @@ -3384,7 +3385,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, case NETDEV_CHANGEMTU: /* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */ if (dev->mtu < IPV6_MIN_MTU) { - addrconf_ifdown(dev, 1); + addrconf_ifdown(dev, dev != net->loopback_dev); break; } @@ -3500,7 +3501,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, * IPV6_MIN_MTU stop IPv6 on this interface. */ if (dev->mtu < IPV6_MIN_MTU) - addrconf_ifdown(dev, 1); + addrconf_ifdown(dev, dev != net->loopback_dev); } break; From b92b7d3312033a08cae2c879b9243c42ad7de94b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Jun 2017 00:16:37 +0200 Subject: [PATCH 096/149] netvsc: don't access netdev->num_rx_queues directly This structure member is hidden behind CONFIG_SYSFS, and we get a build error when that is disabled: drivers/net/hyperv/netvsc_drv.c: In function 'netvsc_set_channels': drivers/net/hyperv/netvsc_drv.c:754:49: error: 'struct net_device' has no member named 'num_rx_queues'; did you mean 'num_tx_queues'? drivers/net/hyperv/netvsc_drv.c: In function 'netvsc_set_rxfh': drivers/net/hyperv/netvsc_drv.c:1181:25: error: 'struct net_device' has no member named 'num_rx_queues'; did you mean 'num_tx_queues'? As the value is only set once to the argument of alloc_netdev_mq(), we can compare against that constant directly. Fixes: ff4a44199012 ("netvsc: allow get/set of RSS indirection table") Fixes: 2b01888d1b45 ("netvsc: allow more flexible setting of number of channels") Signed-off-by: Arnd Bergmann Reviewed-by: Haiyang Zhang Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 82d6c022ca85..643c539a08ba 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -776,7 +776,7 @@ static int netvsc_set_channels(struct net_device *net, channels->rx_count || channels->tx_count || channels->other_count) return -EINVAL; - if (count > net->num_tx_queues || count > net->num_rx_queues) + if (count > net->num_tx_queues || count > VRSS_CHANNEL_MAX) return -EINVAL; if (!nvdev || nvdev->destroy) @@ -1203,7 +1203,7 @@ static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir, rndis_dev = ndev->extension; if (indir) { for (i = 0; i < ITAB_NUM; i++) - if (indir[i] >= dev->num_rx_queues) + if (indir[i] >= VRSS_CHANNEL_MAX) return -EINVAL; for (i = 0; i < ITAB_NUM; i++) From bb53f4d4f5116d3dae76bb12fb16bc73771f958a Mon Sep 17 00:00:00 2001 From: Martin Habets Date: Thu, 22 Jun 2017 10:50:41 +0100 Subject: [PATCH 097/149] sfc: Fix MCDI command size for filter operations The 8000 series adapters uses catch-all filters for encapsulated traffic to support filtering VXLAN, NVGRE and GENEVE traffic. This new filter functionality requires a longer MCDI command. 
This patch increases the size of buffers on stack that were missed, which fixes a kernel panic from the stack protector. Fixes: 9b41080125176 ("sfc: insert catch-all filters for encapsulated traffic") Signed-off-by: Martin Habets Acked-by: Edward Cree Acked-by: Bert Kenward bkenward@solarflare.com Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 78efb2822b86..a8089667fc5b 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -4172,7 +4172,7 @@ found: * recipients */ if (is_mc_recip) { - MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN); + MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN); unsigned int depth, i; memset(inbuf, 0, sizeof(inbuf)); @@ -4320,7 +4320,7 @@ static int efx_ef10_filter_remove_internal(struct efx_nic *efx, efx_ef10_filter_set_entry(table, filter_idx, NULL, 0); } else { efx_mcdi_display_error(efx, MC_CMD_FILTER_OP, - MC_CMD_FILTER_OP_IN_LEN, + MC_CMD_FILTER_OP_EXT_IN_LEN, NULL, 0, rc); } } @@ -4453,7 +4453,7 @@ static s32 efx_ef10_filter_rfs_insert(struct efx_nic *efx, struct efx_filter_spec *spec) { struct efx_ef10_filter_table *table = efx->filter_state; - MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN); + MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN); struct efx_filter_spec *saved_spec; unsigned int hash, i, depth = 1; bool replacing = false; @@ -4940,7 +4940,7 @@ not_restored: static void efx_ef10_filter_table_remove(struct efx_nic *efx) { struct efx_ef10_filter_table *table = efx->filter_state; - MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_IN_LEN); + MCDI_DECLARE_BUF(inbuf, MC_CMD_FILTER_OP_EXT_IN_LEN); struct efx_filter_spec *spec; unsigned int filter_idx; int rc; From 7598f8bc1383ffd77686cb4e92e749bef3c75937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 21 Jun 2017 18:41:34 +0200 Subject: [PATCH 098/149] perf probe: Fix probe definition for inlined functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 613f050d68a8 ("perf probe: Fix to probe on gcc generated functions in modules"), the offset from symbol is, incorrectly, added to the trace point address. This leads to incorrect probe trace points for inlined functions and when using relative line number on symbols. 
Prior to this patch: $ perf probe -m nf_nat -D in_range p:probe/in_range nf_nat:in_range.isra.9+0 $ perf probe -m i40e -D i40e_clean_rx_irq p:probe/i40e_clean_rx_irq i40e:i40e_napi_poll+2212 $ perf probe -m i40e -D i40e_clean_rx_irq:16 p:probe/i40e_clean_rx_irq i40e:i40e_lan_xmit_frame+626 After: $ perf probe -m nf_nat -D in_range p:probe/in_range nf_nat:in_range.isra.9+0 $ perf probe -m i40e -D i40e_clean_rx_irq p:probe/i40e_clean_rx_irq i40e:i40e_napi_poll+1106 $ perf probe -m i40e -D i40e_clean_rx_irq:16 p:probe/i40e_clean_rx_irq i40e:i40e_napi_poll+2665 Committer testing: Using 'pfunct', a tool found in the 'dwarves' package [1], one can ask which functions, while not being explicitly marked as inline, were inlined by the compiler: # pfunct --cc_inlined /lib/modules/4.12.0-rc4+/kernel/drivers/net/ethernet/intel/e1000e/e1000e.ko | head __ew32 e1000_regdump e1000e_dump_ps_pages e1000_desc_unused e1000e_systim_to_hwtstamp e1000e_rx_hwtstamp e1000e_update_rdt_wa e1000e_update_tdt_wa e1000_put_txbuf e1000_consume_page Then ask 'perf probe' to produce the kprobe_tracer probe definitions for two of them: # perf probe -m e1000e -D e1000e_rx_hwtstamp p:probe/e1000e_rx_hwtstamp e1000e:e1000_receive_skb+74 # perf probe -m e1000e -D e1000_consume_page p:probe/e1000_consume_page e1000e:e1000_clean_jumbo_rx_irq+876 p:probe/e1000_consume_page_1 e1000e:e1000_clean_jumbo_rx_irq+1506 p:probe/e1000_consume_page_2 e1000e:e1000_clean_rx_irq_ps+1074 Now let's concentrate on the 'e1000_consume_page' one, which was inlined twice in e1000_clean_jumbo_rx_irq(); let's see what readelf says about the DWARF tags for that function: $ readelf -wi /lib/modules/4.12.0-rc4+/kernel/drivers/net/ethernet/intel/e1000e/e1000e.ko <1><13e27b>: Abbrev Number: 121 (DW_TAG_subprogram) <13e27c> DW_AT_name : (indirect string, offset: 0xa8945): e1000_clean_jumbo_rx_irq <13e287> DW_AT_low_pc : 0x17a30 <3><13e6ef>: Abbrev Number: 119 (DW_TAG_inlined_subroutine) <13e6f0> DW_AT_abstract_origin: <0x13ed2c> <13e6f4> DW_AT_low_pc : 0x17be6 <1><13ed2c>: Abbrev Number: 142 (DW_TAG_subprogram) <13ed2e> DW_AT_name : (indirect string, offset: 0xa54c3): e1000_consume_page So, the first time in e1000_clean_jumbo_rx_irq() where e1000_consume_page() is inlined is at PC 0x17be6, which, subtracted from e1000_clean_jumbo_rx_irq()'s address, gives us the offset we should use in the probe definition: 0x17be6 - 0x17a30 = 438 but above we have 876, which is twice as much. Let's see the second inline expansion of e1000_consume_page() in e1000_clean_jumbo_rx_irq(): <3><13e86e>: Abbrev Number: 119 (DW_TAG_inlined_subroutine) <13e86f> DW_AT_abstract_origin: <0x13ed2c> <13e873> DW_AT_low_pc : 0x17d21 0x17d21 - 0x17a30 = 753 So we were adding it at twice the offset from the containing function that we should have.
And then after this patch: # perf probe -m e1000e -D e1000e_rx_hwtstamp p:probe/e1000e_rx_hwtstamp e1000e:e1000_receive_skb+37 # perf probe -m e1000e -D e1000_consume_page p:probe/e1000_consume_page e1000e:e1000_clean_jumbo_rx_irq+438 p:probe/e1000_consume_page_1 e1000e:e1000_clean_jumbo_rx_irq+753 p:probe/e1000_consume_page_2 e1000e:e1000_clean_jumbo_rx_irq+1353 # Which matches the two first expansions and shows that because we were doubling the offset it would spill over the next function: readelf -sw /lib/modules/4.12.0-rc4+/kernel/drivers/net/ethernet/intel/e1000e/e1000e.ko 673: 0000000000017a30 1626 FUNC LOCAL DEFAULT 2 e1000_clean_jumbo_rx_irq 674: 0000000000018090 2013 FUNC LOCAL DEFAULT 2 e1000_clean_rx_irq_ps This is the 3rd inline expansion of e1000_consume_page() in e1000_clean_jumbo_rx_irq(): <3><13ec77>: Abbrev Number: 119 (DW_TAG_inlined_subroutine) <13ec78> DW_AT_abstract_origin: <0x13ed2c> <13ec7c> DW_AT_low_pc : 0x17f79 0x17f79 - 0x17a30 = 1353 So: 0x17a30 + 2 * 1353 = 0x184c2 And: 0x184c2 - 0x18090 = 1074 Which explains the bogus third expansion for e1000_consume_page() to end up at: p:probe/e1000_consume_page_2 e1000e:e1000_clean_rx_irq_ps+1074 All fixed now :-) [1] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/ Signed-off-by: Björn Töpel Tested-by: Arnaldo Carvalho de Melo Acked-by: Magnus Karlsson Acked-by: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 613f050d68a8 ("perf probe: Fix to probe on gcc generated functions in modules") Link: http://lkml.kernel.org/r/20170621164134.5701-1-bjorn.topel@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 84e7e698411e..a2670e9d652d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -619,7 +619,7 @@ static int post_process_probe_trace_point(struct probe_trace_point *tp, struct map *map, unsigned long offs) { struct symbol *sym; - u64 addr = tp->address + tp->offset - offs; + u64 addr = tp->address - offs; sym = map__find_symbol(map, addr); if (!sym) From ad8181060788c80c0ad75b583f24c22fa962a7a6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 22 May 2017 18:44:57 -0700 Subject: [PATCH 099/149] kconfig: fix sparse warnings in nconfig Fix sparse warnings in scripts/kconfig/nconf* ('make nconfig'): ../scripts/kconfig/nconf.c:1071:32: warning: Using plain integer as NULL pointer ../scripts/kconfig/nconf.c:1238:30: warning: Using plain integer as NULL pointer ../scripts/kconfig/nconf.c:511:51: warning: Using plain integer as NULL pointer ../scripts/kconfig/nconf.c:1460:6: warning: symbol 'setup_windows' was not declared. Should it be static? ../scripts/kconfig/nconf.c:274:12: warning: symbol 'current_instructions' was not declared. Should it be static? ../scripts/kconfig/nconf.c:308:22: warning: symbol 'function_keys' was not declared. Should it be static? ../scripts/kconfig/nconf.gui.c:132:17: warning: non-ANSI function declaration of function 'set_colors' ../scripts/kconfig/nconf.gui.c:195:24: warning: Using plain integer as NULL pointer nconf.gui.o before/after files are the same. nconf.o before/after files are the same until the 'static' function declarations are added. 
Signed-off-by: Randy Dunlap Signed-off-by: Masahiro Yamada --- scripts/kconfig/nconf.c | 12 ++++++------ scripts/kconfig/nconf.gui.c | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c index a9bc5334a478..003114779815 100644 --- a/scripts/kconfig/nconf.c +++ b/scripts/kconfig/nconf.c @@ -271,7 +271,7 @@ static struct mitem k_menu_items[MAX_MENU_ITEMS]; static int items_num; static int global_exit; /* the currently selected button */ -const char *current_instructions = menu_instructions; +static const char *current_instructions = menu_instructions; static char *dialog_input_result; static int dialog_input_result_len; @@ -305,7 +305,7 @@ struct function_keys { }; static const int function_keys_num = 9; -struct function_keys function_keys[] = { +static struct function_keys function_keys[] = { { .key_str = "F1", .func = "Help", @@ -508,7 +508,7 @@ static int get_mext_match(const char *match_str, match_f flag) index = (index + items_num) % items_num; while (true) { char *str = k_menu_items[index].str; - if (strcasestr(str, match_str) != 0) + if (strcasestr(str, match_str) != NULL) return index; if (flag == FIND_NEXT_MATCH_UP || flag == MATCH_TINKER_PATTERN_UP) @@ -1067,7 +1067,7 @@ static int do_match(int key, struct match_state *state, int *ans) static void conf(struct menu *menu) { - struct menu *submenu = 0; + struct menu *submenu = NULL; const char *prompt = menu_get_prompt(menu); struct symbol *sym; int res; @@ -1234,7 +1234,7 @@ static void show_help(struct menu *menu) static void conf_choice(struct menu *menu) { const char *prompt = _(menu_get_prompt(menu)); - struct menu *child = 0; + struct menu *child = NULL; struct symbol *active; int selected_index = 0; int last_top_row = 0; @@ -1456,7 +1456,7 @@ static void conf_save(void) } } -void setup_windows(void) +static void setup_windows(void) { int lines, columns; diff --git a/scripts/kconfig/nconf.gui.c b/scripts/kconfig/nconf.gui.c index 4b2f44c20caf..a64b1c31253e 100644 --- a/scripts/kconfig/nconf.gui.c +++ b/scripts/kconfig/nconf.gui.c @@ -129,7 +129,7 @@ static void no_colors_theme(void) mkattrn(FUNCTION_TEXT, A_REVERSE); } -void set_colors() +void set_colors(void) { start_color(); use_default_colors(); @@ -192,7 +192,7 @@ const char *get_line(const char *text, int line_no) int lines = 0; if (!text) - return 0; + return NULL; for (i = 0; text[i] != '\0' && lines < line_no; i++) if (text[i] == '\n') From 34f19ff1b5a0d11e46df479623d6936460105c9f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Jun 2017 15:58:29 +1000 Subject: [PATCH 100/149] powerpc/64: Initialise thread_info for emergency stacks Emergency stacks have their thread_info mostly uninitialised, which in particular means garbage preempt_count values. Emergency stack code runs with interrupts disabled entirely, and is used very rarely, so this has been unnoticed so far. It was found by a proposed new powerpc watchdog that takes a soft-NMI directly from the masked_interrupt handler and using the emergency stack. That crashed at BUG_ON(in_nmi()) in nmi_enter(). preempt_count()s were found to be garbage. To fix this, zero the entire THREAD_SIZE allocation, and initialize the thread_info. Cc: stable@vger.kernel.org Reported-by: Abdul Haleem Signed-off-by: Nicholas Piggin [mpe: Move it all into setup_64.c, use a function not a macro. 
Fix crashes on Cell by setting preempt_count to 0 not HARDIRQ_OFFSET] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a8c1f99e9607..4640f6d64f8b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -615,6 +615,24 @@ void __init exc_lvl_early_init(void) } #endif +/* + * Emergency stacks are used for a range of things, from asynchronous + * NMIs (system reset, machine check) to synchronous, process context. + * We set preempt_count to zero, even though that isn't necessarily correct. To + * get the right value we'd need to copy it from the previous thread_info, but + * doing that might fault causing more problems. + * TODO: what to do with accounting? + */ +static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu) +{ + ti->task = NULL; + ti->cpu = cpu; + ti->preempt_count = 0; + ti->local_flags = 0; + ti->flags = 0; + klp_init_thread_info(ti); +} + /* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. Exclusive emergency @@ -633,24 +651,31 @@ void __init emergency_stack_init(void) * Since we use these as temporary stacks during secondary CPU * bringup, we need to get at them in real mode. This means they * must also be within the RMO region. + * + * The IRQ stacks allocated elsewhere in this file are zeroed and + * initialized in kernel/irq.c. These are initialized here in order + * to have emergency stacks available as early as possible. */ limit = min(safe_stack_limit(), ppc64_rma_size); for_each_possible_cpu(i) { struct thread_info *ti; ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].emergency_sp = (void *)ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE; /* emergency stack for machine check exception handling. */ ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); - klp_init_thread_info(ti); + memset(ti, 0, THREAD_SIZE); + emerg_stack_init_thread_info(ti, i); paca[i].mc_emergency_sp = (void *)ti + THREAD_SIZE; #endif } From 9768935264c4f0e4afd788a185d8e8d89c28e41d Mon Sep 17 00:00:00 2001 From: Andrew Duggan Date: Fri, 23 Jun 2017 00:04:51 -0700 Subject: [PATCH 101/149] Input: synaptics-rmi4 - only read the F54 query registers which are used The F54 driver is currently only using the first 6 bytes of F54 so there is no need to read all 27 bytes. Some Dell systems (Dell XP13 9333 and similar) have an issue with the touchpad or I2C bus when reading reports larger than 16 bytes. Reads larger than 16 bytes are reported in two HID reports. Something about the back-to-back reports seems to cause the next read to report incorrect data. This results in F30 failing to load and the click button failing to work.
Previous issues with the I2C controller or touchpad were addressed in: commit 5b65c2a02966 ("HID: rmi: check sanity of the incoming report") Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=195949 Signed-off-by: Andrew Duggan Reviewed-by: Benjamin Tissoires Reviewed-by: Nick Dyer Signed-off-by: Dmitry Torokhov --- drivers/input/rmi4/rmi_f54.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/input/rmi4/rmi_f54.c b/drivers/input/rmi4/rmi_f54.c index dea63e2db3e6..f5206e2c767e 100644 --- a/drivers/input/rmi4/rmi_f54.c +++ b/drivers/input/rmi4/rmi_f54.c @@ -31,9 +31,6 @@ #define F54_GET_REPORT 1 #define F54_FORCE_CAL 2 -/* Fixed sizes of reports */ -#define F54_QUERY_LEN 27 - /* F54 capabilities */ #define F54_CAP_BASELINE (1 << 2) #define F54_CAP_IMAGE8 (1 << 3) @@ -95,7 +92,6 @@ struct rmi_f54_reports { struct f54_data { struct rmi_function *fn; - u8 qry[F54_QUERY_LEN]; u8 num_rx_electrodes; u8 num_tx_electrodes; u8 capabilities; @@ -632,22 +628,23 @@ static int rmi_f54_detect(struct rmi_function *fn) { int error; struct f54_data *f54; + u8 buf[6]; f54 = dev_get_drvdata(&fn->dev); error = rmi_read_block(fn->rmi_dev, fn->fd.query_base_addr, - &f54->qry, sizeof(f54->qry)); + buf, sizeof(buf)); if (error) { dev_err(&fn->dev, "%s: Failed to query F54 properties\n", __func__); return error; } - f54->num_rx_electrodes = f54->qry[0]; - f54->num_tx_electrodes = f54->qry[1]; - f54->capabilities = f54->qry[2]; - f54->clock_rate = f54->qry[3] | (f54->qry[4] << 8); - f54->family = f54->qry[5]; + f54->num_rx_electrodes = buf[0]; + f54->num_tx_electrodes = buf[1]; + f54->capabilities = buf[2]; + f54->clock_rate = buf[3] | (buf[4] << 8); + f54->family = buf[5]; rmi_dbg(RMI_DEBUG_FN, &fn->dev, "F54 num_rx_electrodes: %d\n", f54->num_rx_electrodes); From c4d097d13052d1e6f29b8798264aed6135d99568 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 23 Jun 2017 17:27:01 +0200 Subject: [PATCH 102/149] dm raid: fix oops on upgrading to extended superblock format When a RAID set was created on dm-raid version < 1.9.0 (old RAID superblock format), all of the new 1.9.0 members of the superblock are uninitialized (zero) -- including the device sectors member needed to support shrinking. All the other accesses to superblock fields new in 1.9.0 were reviewed and verified to be properly guarded against invalid use. The 'sectors' member was the only one used when the superblock version is < 1.9. Don't access the superblock's >= 1.9.0 'sectors' member unconditionally. Also add respective comments. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7d893228c40f..b4b75dad816a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1927,7 +1927,7 @@ struct dm_raid_superblock { /******************************************************************** * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!! 
* - * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist + * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist */ __le32 flags; /* Flags defining array states for reshaping */ @@ -2092,6 +2092,11 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->layout = cpu_to_le32(mddev->layout); sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); + /******************************************************************** + * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!! + * + * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist + */ sb->new_level = cpu_to_le32(mddev->new_level); sb->new_layout = cpu_to_le32(mddev->new_layout); sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors); @@ -2438,8 +2443,14 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { - /* Retrieve device size stored in superblock to be prepared for shrink */ - rdev->sectors = le64_to_cpu(sb->sectors); + /* + * Retrieve rdev size stored in superblock to be prepared for shrink. + * Check extended superblock members are present otherwise the size + * will not be set! + */ + if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) + rdev->sectors = le64_to_cpu(sb->sectors); + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); if (rdev->recovery_offset == MaxSector) set_bit(In_sync, &rdev->flags); From a5cb659bbc1c8644efa0c3138a757a1e432a4880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Mon, 19 Jun 2017 13:03:43 +0200 Subject: [PATCH 103/149] net: account for current skb length when deciding about UFO Our customer encountered stuck NFS writes for blocks starting at specific offsets w.r.t. page boundary caused by networking stack sending packets via UFO enabled device with wrong checksum. The problem can be reproduced by composing a long UDP datagram from multiple parts using MSG_MORE flag: sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 3000, 0, ...); Assume this packet is to be routed via a device with MTU 1500 and NETIF_F_UFO enabled. When second sendto() gets into __ip_append_data(), this condition is tested (among others) to decide whether to call ip_ufo_append_data(): ((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb)) At the moment, we already have skb with 1028 bytes of data which is not marked for GSO so that the test is false (fragheaderlen is usually 20). Thus we append second 1000 bytes to this skb without invoking UFO. Third sendto(), however, has sufficient length to trigger the UFO path so that we end up with non-UFO skb followed by a UFO one. Later on, udp_send_skb() uses udp_csum() to calculate the checksum but that assumes all fragments have correct checksum in skb->csum which is not true for UFO fragments. When checking against MTU, we need to add skb->len to length of new segment if we already have a partially filled skb and fragheaderlen only if there isn't one. In the IPv6 case, skb can only be null if this is the first segment so that we have to use headersize (length of the first IPv6 header) rather than fragheaderlen (length of IPv6 header of further fragments) for skb == NULL. 
Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Fixes: e4c5e13aa45c ("ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output") Signed-off-by: Michal Kubecek Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- net/ipv6/ip6_output.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7a3fd25e8913..532b36e9ce2a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -964,7 +964,8 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && + if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || + (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index bf8a58a1c32d..1699acb2fa2c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1390,7 +1390,7 @@ emsgsize: */ cork->length += length; - if ((((length + fragheaderlen) > mtu) || + if ((((length + (skb ? skb->len : headersize)) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && From 0ccc22f425e56c4ede9c66f1945846de8ac1f352 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 22 Jun 2017 15:29:33 -0700 Subject: [PATCH 104/149] sit: use __GFP_NOWARN for user controlled allocation The memory allocation size is controlled by user-space, if it is too large just fail silently and return NULL, not to mention there is a fallback allocation later. Reported-by: Andrey Konovalov Signed-off-by: Cong Wang Tested-by: Andrey Konovalov Signed-off-by: David S. Miller --- net/ipv6/sit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2378503577b0..f8ad15891cd7 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -305,7 +305,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t, * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL) : + kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) : NULL; rcu_read_lock(); From db9d8b29d19d2801793e4419f4c6272bf8951c62 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 23 Jun 2017 17:51:31 +0200 Subject: [PATCH 105/149] net: dp83640: Avoid NULL pointer dereference. The function, skb_complete_tx_timestamp(), used to allow passing in a NULL pointer for the time stamps, but that was changed in commit 62bccb8cdb69051b95a55ab0c489e3cab261c8ef ("net-timestamp: Make the clone operation stand-alone from phy timestamping"), and the existing call sites, all of which are in the dp83640 driver, were fixed up. Even though the kernel-doc was subsequently updated in commit 7a76a021cd5a292be875fbc616daf03eab1e6996 ("net-timestamp: Update skb_complete_tx_timestamp comment"), still a bug fix from Manfred Rudigier came into the driver using the old semantics. Probably Manfred derived that patch from an older kernel version. This fix should be applied to the stable trees as well. Fixes: 81e8f2e930fe ("net: dp83640: Fix tx timestamp overflow handling.") Signed-off-by: Richard Cochran Signed-off-by: David S. 
Miller --- drivers/net/phy/dp83640.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index ed0d10f54f26..c3065236ffcc 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -908,7 +908,7 @@ static void decode_txts(struct dp83640_private *dp83640, if (overflow) { pr_debug("tx timestamp queue overflow, count %d\n", overflow); while (skb) { - skb_complete_tx_timestamp(skb, NULL); + kfree_skb(skb); skb = skb_dequeue(&dp83640->tx_queue); } return; From 69c149e2e39e8d66437c9034bb4926ef2c1f7c23 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 23 Jun 2017 14:01:00 -0400 Subject: [PATCH 106/149] bnxt_en: Add missing logic to handle TPA end error conditions. When we get a TPA_END completion to handle a completed LRO packet, it is possible that hardware would indicate errors. The current code is not checking for the error condition. Define the proper error bits and the macro to check for this error and abort properly. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 ++++--- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6 +++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 03f55daecb20..f5ba8ec1b67f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1301,10 +1301,11 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, cp_cons = NEXT_CMP(cp_cons); } - if (unlikely(agg_bufs > MAX_SKB_FRAGS)) { + if (unlikely(agg_bufs > MAX_SKB_FRAGS || TPA_END_ERRORS(tpa_end1))) { bnxt_abort_tpa(bp, bnapi, cp_cons, agg_bufs); - netdev_warn(bp->dev, "TPA frags %d exceeded MAX_SKB_FRAGS %d\n", - agg_bufs, (int)MAX_SKB_FRAGS); + if (agg_bufs > MAX_SKB_FRAGS) + netdev_warn(bp->dev, "TPA frags %d exceeded MAX_SKB_FRAGS %d\n", + agg_bufs, (int)MAX_SKB_FRAGS); return NULL; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3ef42dbc6327..d46a85041083 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -374,12 +374,16 @@ struct rx_tpa_end_cmp_ext { __le32 rx_tpa_end_cmp_errors_v2; #define RX_TPA_END_CMP_V2 (0x1 << 0) - #define RX_TPA_END_CMP_ERRORS (0x7fff << 1) + #define RX_TPA_END_CMP_ERRORS (0x3 << 1) #define RX_TPA_END_CMPL_ERRORS_SHIFT 1 u32 rx_tpa_end_cmp_start_opaque; }; +#define TPA_END_ERRORS(rx_tpa_end_ext) \ + ((rx_tpa_end_ext)->rx_tpa_end_cmp_errors_v2 & \ + cpu_to_le32(RX_TPA_END_CMP_ERRORS)) + #define DB_IDX_MASK 0xffffff #define DB_IDX_VALID (0x1 << 26) #define DB_IRQ_DIS (0x1 << 27) From 2270bc5da34979454e6f2eb133d800b635156174 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 23 Jun 2017 14:01:01 -0400 Subject: [PATCH 107/149] bnxt_en: Fix netpoll handling. To handle netpoll properly, the driver must only handle TX packets during NAPI. Handling RX events cause warnings and errors in netpoll mode. The ndo_poll_controller() method should call napi_schedule() directly so that a NAPI weight of zero will be used during netpoll mode. The bnxt_en driver supports 2 ring modes: combined, and separate rx/tx. In separate rx/tx mode, the ndo_poll_controller() method will only process the tx rings. In combined mode, the rx and tx completion entries are mixed in the completion ring and we need to drop the rx entries and recycle the rx buffers. 
Add a function bnxt_force_rx_discard() to handle this in netpoll mode when we see rx entries in combined ring mode. Reported-by: Calvin Owens Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 54 ++++++++++++++++++++--- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f5ba8ec1b67f..74e8e215524d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1563,6 +1563,45 @@ next_rx_no_prod: return rc; } +/* In netpoll mode, if we are using a combined completion ring, we need to + * discard the rx packets and recycle the buffers. + */ +static int bnxt_force_rx_discard(struct bnxt *bp, struct bnxt_napi *bnapi, + u32 *raw_cons, u8 *event) +{ + struct bnxt_cp_ring_info *cpr = &bnapi->cp_ring; + u32 tmp_raw_cons = *raw_cons; + struct rx_cmp_ext *rxcmp1; + struct rx_cmp *rxcmp; + u16 cp_cons; + u8 cmp_type; + + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp = (struct rx_cmp *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + tmp_raw_cons = NEXT_RAW_CMP(tmp_raw_cons); + cp_cons = RING_CMP(tmp_raw_cons); + rxcmp1 = (struct rx_cmp_ext *) + &cpr->cp_desc_ring[CP_RING(cp_cons)][CP_IDX(cp_cons)]; + + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + return -EBUSY; + + cmp_type = RX_CMP_TYPE(rxcmp); + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + rxcmp1->rx_cmp_cfa_code_errors_v2 |= + cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR); + } else if (cmp_type == CMP_TYPE_RX_L2_TPA_END_CMP) { + struct rx_tpa_end_cmp_ext *tpa_end1; + + tpa_end1 = (struct rx_tpa_end_cmp_ext *)rxcmp1; + tpa_end1->rx_tpa_end_cmp_errors_v2 |= + cpu_to_le32(RX_TPA_END_CMP_ERRORS); + } + return bnxt_rx_pkt(bp, bnapi, raw_cons, event); +} + #define BNXT_GET_EVENT_PORT(data) \ ((data) & \ ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK) @@ -1745,7 +1784,11 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget) if (unlikely(tx_pkts > bp->tx_wake_thresh)) rx_pkts = budget; } else if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { - rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); + if (likely(budget)) + rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event); + else + rc = bnxt_force_rx_discard(bp, bnapi, &raw_cons, + &event); if (likely(rc >= 0)) rx_pkts += rc; else if (rc == -EBUSY) /* partial completion */ @@ -6664,12 +6707,11 @@ static void bnxt_poll_controller(struct net_device *dev) struct bnxt *bp = netdev_priv(dev); int i; - for (i = 0; i < bp->cp_nr_rings; i++) { - struct bnxt_irq *irq = &bp->irq_tbl[i]; + /* Only process tx rings/combined rings in netpoll mode. */ + for (i = 0; i < bp->tx_nr_rings; i++) { + struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; - disable_irq(irq->vector); - irq->handler(irq->vector, bp->bnapi[i]); - enable_irq(irq->vector); + napi_schedule(&txr->bnapi->napi); } } #endif From c891d9f6bf2a78c9c657656872a60807820db4c8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 23 Jun 2017 15:08:38 -0700 Subject: [PATCH 108/149] mm, thp: remove cond_resched from __collapse_huge_page_copy This is a partial revert of commit 338a16ba1549 ("mm, thp: copying user pages must schedule on collapse") which added a cond_resched() to __collapse_huge_page_copy(). On x86 with CONFIG_HIGHPTE, __collapse_huge_page_copy is called in atomic context and thus scheduling is not possible. This is only a possible config on arm and i386. 
Although need_resched has been shown to be set for over 100 jiffies while doing the iteration in __collapse_huge_page_copy, this is better than doing if (in_atomic()) cond_resched() to cover only non-CONFIG_HIGHPTE configs. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706191341550.97821@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Larry Finger Tested-by: Larry Finger Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 945fd1ca49b5..df4ebdb2b10a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -652,7 +652,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spin_unlock(ptl); free_page_and_swap_cache(src_page); } - cond_resched(); } } From 029c54b09599573015a5c18dbe59cbdf42742237 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Jun 2017 15:08:41 -0700 Subject: [PATCH 109/149] mm/vmalloc.c: huge-vmap: fail gracefully on unexpected huge vmap mappings Existing code that uses vmalloc_to_page() may assume that any address for which is_vmalloc_addr() returns true may be passed into vmalloc_to_page() to retrieve the associated struct page. This is not an unreasonable assumption to make, but on architectures that have CONFIG_HAVE_ARCH_HUGE_VMAP=y, it no longer holds, and we need to ensure that vmalloc_to_page() does not go off into the weeds trying to dereference huge PUDs or PMDs as table entries. Given that vmalloc() and vmap() themselves never create huge mappings or deal with compound pages at all, there is no correct answer in this case, so return NULL instead, and issue a warning. When reading /proc/kcore on arm64, you will hit an oops as soon as you hit the huge mappings used for the various segments that make up the mapping of vmlinux. With this patch applied, you will no longer hit the oops, but the kcore contents will be incorrect (these regions will be zeroed out). We are fixing this for kcore specifically, so it avoids vread() for those regions. At least one other problematic user exists, i.e., /dev/kmem, but that is currently broken on arm64 for other reasons. Link: http://lkml.kernel.org/r/20170609082226.26152-1-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Reviewed-by: Laura Abbott Cc: Michal Hocko Cc: zhong jiang Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 34a1c3e46ed7..ecc97f74ab18 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -287,10 +287,21 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (p4d_none(*p4d)) return NULL; pud = pud_offset(p4d, addr); - if (pud_none(*pud)) + + /* + * Don't dereference bad PUD or PMD (below) entries. This will also + * identify huge mappings, which we may encounter on architectures + * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be + * identified as vmalloc addresses by is_vmalloc_addr(), but are + * not [unambiguously] associated with a struct page, so there is + * no correct value to return for them.
+ */ + WARN_ON_ONCE(pud_bad(*pud)); + if (pud_none(*pud) || pud_bad(*pud)) return NULL; pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + WARN_ON_ONCE(pmd_bad(*pmd)); + if (pmd_none(*pmd) || pmd_bad(*pmd)) return NULL; ptep = pte_offset_map(pmd, addr); From 9fa4eb8e490a28de40964b1b0e583d8db4c7e57c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 23 Jun 2017 15:08:43 -0700 Subject: [PATCH 110/149] autofs: sanity check status reported with AUTOFS_DEV_IOCTL_FAIL If a positive status is passed with the AUTOFS_DEV_IOCTL_FAIL ioctl, autofs4_d_automount() will return ERR_PTR(status) with that status to follow_automount(), which will then dereference an invalid pointer. So treat a positive status the same as zero, and map to ENOENT. See comment in systemd src/core/automount.c::automount_send_ready(). Link: http://lkml.kernel.org/r/871sqwczx5.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/dev-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 734cbf8d9676..dd9f1bebb5a3 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -344,7 +344,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, int status; token = (autofs_wqt_t) param->fail.token; - status = param->fail.status ? param->fail.status : -ENOENT; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } From 1eb643d02b21412e603b42cdd96010a2ac31c05f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 23 Jun 2017 15:08:46 -0700 Subject: [PATCH 111/149] fs/dax.c: fix inefficiency in dax_writeback_mapping_range() dax_writeback_mapping_range() fails to update iteration index when searching radix tree for entries needing cache flushing. Thus each pagevec worth of entries is searched starting from the start which is inefficient and prone to livelocks. Update index properly. Link: http://lkml.kernel.org/r/20170619124531.21491-1-jack@suse.cz Fixes: 9973c98ecfda3 ("dax: add support for fsync/sync") Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Dan Williams Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dax.c b/fs/dax.c index 2a6889b3585f..9187f3b07f3e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -859,6 +859,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (ret < 0) goto out; } + start_index = indices[pvec.nr - 1] + 1; } out: put_dax(dax_dev); From a91e0f680bcd9e10c253ae8b62462a38bd48f09f Mon Sep 17 00:00:00 2001 From: Ilya Matveychikov Date: Fri, 23 Jun 2017 15:08:49 -0700 Subject: [PATCH 112/149] lib/cmdline.c: fix get_options() overflow while parsing ranges When using get_options() it's possible to specify a range of numbers, like 1-100500. The problem is that it doesn't track array size while calling internally to get_range() which iterates over the range and fills the memory with numbers. Link: http://lkml.kernel.org/r/2613C75C-B04D-4BFF-82A6-12F97BA0F620@gmail.com Signed-off-by: Ilya V. Matveychikov Cc: Jonathan Corbet Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/cmdline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index 3c6432df7e63..4c0888c4a68d 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -23,14 +23,14 @@ * the values[M, M+1, ..., N] into the ints array in get_options. 
*/ -static int get_range(char **str, int *pint) +static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; - for (x = *pint; x < upper_range; x++) + for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } @@ -97,7 +97,7 @@ char *get_options(const char *str, int nints, int *ints) break; if (res == 3) { int range_nums; - range_nums = get_range((char **)&str, ints + i); + range_nums = get_range((char **)&str, ints + i, nints - i); if (range_nums < 0) break; /* From 3b7b314053d021601940c50b07f5f1423ae67e21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 23 Jun 2017 15:08:52 -0700 Subject: [PATCH 113/149] slub: make sysfs file removal asynchronous Commit bf5eb3de3847 ("slub: separate out sysfs_slab_release() from sysfs_slab_remove()") made slub sysfs file removals synchronous to kmem_cache shutdown. Unfortunately, this created a possible ABBA deadlock between slab_mutex and sysfs draining mechanism triggering the following lockdep warning. ====================================================== [ INFO: possible circular locking dependency detected ] 4.10.0-test+ #48 Not tainted ------------------------------------------------------- rmmod/1211 is trying to acquire lock: (s_active#120){++++.+}, at: [] kernfs_remove+0x23/0x40 but task is already holding lock: (slab_mutex){+.+.+.}, at: [] kmem_cache_destroy+0x41/0x2d0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (slab_mutex){+.+.+.}: lock_acquire+0xf6/0x1f0 __mutex_lock+0x75/0x950 mutex_lock_nested+0x1b/0x20 slab_attr_store+0x75/0xd0 sysfs_kf_write+0x45/0x60 kernfs_fop_write+0x13c/0x1c0 __vfs_write+0x28/0x120 vfs_write+0xc8/0x1e0 SyS_write+0x49/0xa0 entry_SYSCALL_64_fastpath+0x1f/0xc2 -> #0 (s_active#120){++++.+}: __lock_acquire+0x10ed/0x1260 lock_acquire+0xf6/0x1f0 __kernfs_remove+0x254/0x320 kernfs_remove+0x23/0x40 sysfs_remove_dir+0x51/0x80 kobject_del+0x18/0x50 __kmem_cache_shutdown+0x3e6/0x460 kmem_cache_destroy+0x1fb/0x2d0 kvm_exit+0x2d/0x80 [kvm] vmx_exit+0x19/0xa1b [kvm_intel] SyS_delete_module+0x198/0x1f0 entry_SYSCALL_64_fastpath+0x1f/0xc2 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(slab_mutex); lock(s_active#120); lock(slab_mutex); lock(s_active#120); *** DEADLOCK *** 2 locks held by rmmod/1211: #0: (cpu_hotplug.dep_map){++++++}, at: [] get_online_cpus+0x37/0x80 #1: (slab_mutex){+.+.+.}, at: [] kmem_cache_destroy+0x41/0x2d0 stack backtrace: CPU: 3 PID: 1211 Comm: rmmod Not tainted 4.10.0-test+ #48 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v02.05 05/07/2012 Call Trace: print_circular_bug+0x1be/0x210 __lock_acquire+0x10ed/0x1260 lock_acquire+0xf6/0x1f0 __kernfs_remove+0x254/0x320 kernfs_remove+0x23/0x40 sysfs_remove_dir+0x51/0x80 kobject_del+0x18/0x50 __kmem_cache_shutdown+0x3e6/0x460 kmem_cache_destroy+0x1fb/0x2d0 kvm_exit+0x2d/0x80 [kvm] vmx_exit+0x19/0xa1b [kvm_intel] SyS_delete_module+0x198/0x1f0 ? SyS_delete_module+0x5/0x1f0 entry_SYSCALL_64_fastpath+0x1f/0xc2 It'd be the cleanest to deal with the issue by removing sysfs files without holding slab_mutex before the rest of shutdown; however, given the current code structure, it is pretty difficult to do so. This patch punts sysfs file removal to a work item. Before commit bf5eb3de3847, the removal was punted to a RCU delayed work item which is executed after release. 
Now, we're punting to a different work item on shutdown which still maintains the goal removing the sysfs files earlier when destroying kmem_caches. Link: http://lkml.kernel.org/r/20170620204512.GI21326@htj.duckdns.org Fixes: bf5eb3de3847 ("slub: separate out sysfs_slab_release() from sysfs_slab_remove()") Signed-off-by: Tejun Heo Reported-by: Steven Rostedt (VMware) Tested-by: Steven Rostedt (VMware) Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 + mm/slub.c | 40 ++++++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 07ef550c6627..93315d6b21a8 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -84,6 +84,7 @@ struct kmem_cache { int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ + struct work_struct kobj_remove_work; #endif #ifdef CONFIG_MEMCG struct memcg_cache_params memcg_params; diff --git a/mm/slub.c b/mm/slub.c index 7449593fca72..8addc535bcdc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5625,6 +5625,28 @@ static char *create_unique_id(struct kmem_cache *s) return name; } +static void sysfs_slab_remove_workfn(struct work_struct *work) +{ + struct kmem_cache *s = + container_of(work, struct kmem_cache, kobj_remove_work); + + if (!s->kobj.state_in_sysfs) + /* + * For a memcg cache, this may be called during + * deactivation and again on shutdown. Remove only once. + * A cache is never shut down before deactivation is + * complete, so no need to worry about synchronization. + */ + return; + +#ifdef CONFIG_MEMCG + kset_unregister(s->memcg_kset); +#endif + kobject_uevent(&s->kobj, KOBJ_REMOVE); + kobject_del(&s->kobj); + kobject_put(&s->kobj); +} + static int sysfs_slab_add(struct kmem_cache *s) { int err; @@ -5632,6 +5654,8 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn); + if (!kset) { kobject_init(&s->kobj, &slab_ktype); return 0; @@ -5695,20 +5719,8 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; - if (!s->kobj.state_in_sysfs) - /* - * For a memcg cache, this may be called during - * deactivation and again on shutdown. Remove only once. - * A cache is never shut down before deactivation is - * complete, so no need to worry about synchronization. - */ - return; - -#ifdef CONFIG_MEMCG - kset_unregister(s->memcg_kset); -#endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); + kobject_get(&s->kobj); + schedule_work(&s->kobj_remove_work); } void sysfs_slab_release(struct kmem_cache *s) From 8818efaaacb78c60a9d90c5705b6c99b75d7d442 Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Fri, 23 Jun 2017 15:08:55 -0700 Subject: [PATCH 114/149] ocfs2: fix deadlock caused by recursive locking in xattr Another deadlock path caused by recursive locking is reported. This kind of issue was introduced since commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()"). Two deadlock paths have been fixed by commit b891fa5024a9 ("ocfs2: fix deadlock issue when taking inode lock at vfs entry points"). Yes, we intend to fix this kind of case in incremental way, because it's hard to find out all possible paths at once. This one can be reproduced like this. On node1, cp a large file from home directory to ocfs2 mountpoint. 
While on node2, run setfacl/getfacl. Both nodes will hang up there. The backtraces: On node1: __ocfs2_cluster_lock.isra.39+0x357/0x740 [ocfs2] ocfs2_inode_lock_full_nested+0x17d/0x840 [ocfs2] ocfs2_write_begin+0x43/0x1a0 [ocfs2] generic_perform_write+0xa9/0x180 __generic_file_write_iter+0x1aa/0x1d0 ocfs2_file_write_iter+0x4f4/0xb40 [ocfs2] __vfs_write+0xc3/0x130 vfs_write+0xb1/0x1a0 SyS_write+0x46/0xa0 On node2: __ocfs2_cluster_lock.isra.39+0x357/0x740 [ocfs2] ocfs2_inode_lock_full_nested+0x17d/0x840 [ocfs2] ocfs2_xattr_set+0x12e/0xe80 [ocfs2] ocfs2_set_acl+0x22d/0x260 [ocfs2] ocfs2_iop_set_acl+0x65/0xb0 [ocfs2] set_posix_acl+0x75/0xb0 posix_acl_xattr_set+0x49/0xa0 __vfs_setxattr+0x69/0x80 __vfs_setxattr_noperm+0x72/0x1a0 vfs_setxattr+0xa7/0xb0 setxattr+0x12d/0x190 path_setxattr+0x9f/0xb0 SyS_setxattr+0x14/0x20 Fix this one by using ocfs2_inode_{lock|unlock}_tracker, which is exported by commit 439a36b8ef38 ("ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock"). Link: http://lkml.kernel.org/r/20170622014746.5815-1-zren@suse.com Fixes: 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") Signed-off-by: Eric Ren Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 4 ++++ fs/ocfs2/xattr.c | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3b7c937a36b5..4689940a953c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2591,6 +2591,10 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, struct ocfs2_lock_res *lockres; lockres = &OCFS2_I(inode)->ip_inode_lockres; + /* had_lock means that the currect process already takes the cluster + * lock previously. If had_lock is 1, we have nothing to do here, and + * it will get unlocked where we got the lock. 
+ */ if (!had_lock) { ocfs2_remove_holder(lockres, oh); ocfs2_inode_unlock(inode, ex); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 3c5384d9b3a5..f70c3778d600 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1328,20 +1328,21 @@ static int ocfs2_xattr_get(struct inode *inode, void *buffer, size_t buffer_size) { - int ret; + int ret, had_lock; struct buffer_head *di_bh = NULL; + struct ocfs2_lock_holder oh; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh); + if (had_lock < 0) { + mlog_errno(had_lock); + return had_lock; } down_read(&OCFS2_I(inode)->ip_xattr_sem); ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, name, buffer, buffer_size); up_read(&OCFS2_I(inode)->ip_xattr_sem); - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -3537,11 +3538,12 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits, ref_meta = 0, ref_credits = 0; + int ret, credits, had_lock, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, }; struct ocfs2_refcount_tree *ref_tree = NULL; + struct ocfs2_lock_holder oh; struct ocfs2_xattr_info xi = { .xi_name_index = name_index, @@ -3572,8 +3574,9 @@ int ocfs2_xattr_set(struct inode *inode, return -ENOMEM; } - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { + had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 1, &oh); + if (had_lock < 0) { + ret = had_lock; mlog_errno(ret); goto cleanup_nolock; } @@ -3670,7 +3673,7 @@ cleanup: if (ret) mlog_errno(ret); } - ocfs2_inode_unlock(inode, 1); + ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock); cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); From 98da7d08850fb8bdeb395d6368ed15753304aa0c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Jun 2017 15:08:57 -0700 Subject: [PATCH 115/149] fs/exec.c: account for argv/envp pointers When limiting the argv/envp strings during exec to 1/4 of the stack limit, the storage of the pointers to the strings was not included. This means that an exec with huge numbers of tiny strings could eat 1/4 of the stack limit in strings and then additional space would be later used by the pointers to the strings. For example, on 32-bit with a 8MB stack rlimit, an exec with 1677721 single-byte strings would consume less than 2MB of stack, the max (8MB / 4) amount allowed, but the pointers to the strings would consume the remaining additional stack space (1677721 * 4 == 6710884). The result (1677721 + 6710884 == 8388605) would exhaust stack space entirely. Controlling this stack exhaustion could result in pathological behavior in setuid binaries (CVE-2017-1000365). 
[akpm@linux-foundation.org: additional commenting from Kees] Fixes: b6a2fea39318 ("mm: variable length argument support") Link: http://lkml.kernel.org/r/20170622001720.GA32173@beast Signed-off-by: Kees Cook Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Alexander Viro Cc: Qualys Security Advisory Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 72934df68471..904199086490 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -220,8 +220,26 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, if (write) { unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; + unsigned long ptr_size; struct rlimit *rlim; + /* + * Since the stack will hold pointers to the strings, we + * must account for them as well. + * + * The size calculation is the entire vma while each arg page is + * built, so each time we get here it's calculating how far it + * is currently (rather than each call being just the newly + * added size from the arg page). As a result, we need to + * always add the entire size of the pointers, so that on the + * last call to get_arg_page() we'll actually have the entire + * correct size. + */ + ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + if (ptr_size > ULONG_MAX - size) + goto fail; + size += ptr_size; + acct_arg_size(bprm, size / PAGE_SIZE); /* @@ -239,13 +257,15 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ rlim = current->signal->rlim; - if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { - put_page(page); - return NULL; - } + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) + goto fail; } return page; + +fail: + put_page(page); + return NULL; } static void put_arg_page(struct page *page) From 26fcd952d5c977a94ac64bb44ed409e37607b2c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 Jun 2017 10:50:38 +0200 Subject: [PATCH 116/149] x86/mshyperv: Remove excess #includes from mshyperv.h A recent commit included linux/slab.h in linux/irq.h. This breaks the build of vdso32 on a 64-bit kernel. The reason is that linux/irq.h gets included into the vdso code via linux/interrupt.h which is included from asm/mshyperv.h. That makes the 32-bit vdso compile fail, because slab.h includes the pgtable headers for 64-bit on a 64-bit build. Neither linux/clocksource.h nor linux/interrupt.h are needed in the mshyperv.h header file itself - it has a dependency on . Remove the includes and unbreak the build. Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: K. Y. 
Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Vitaly Kuznetsov Cc: devel@linuxdriverproject.org Fixes: dee863b571b0 ("hv: export current Hyper-V clocksource") Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1706231038460.2647@nanos Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mshyperv.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index fba100713924..d5acc27ed1cc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -2,8 +2,7 @@ #define _ASM_X86_MSHYPER_H #include -#include -#include +#include #include /* From d0c32a16235aeacd32c9de6ff90f9219614d7e4e Mon Sep 17 00:00:00 2001 From: "Mintz, Yuval" Date: Sat, 24 Jun 2017 15:37:00 +0300 Subject: [PATCH 117/149] bnx2x: Don't log mc removal needlessly When mc configuration changes, bnx2x_config_mcast() can return 0 for success, negative for failure and positive for a benign reason preventing its immediate work, e.g., when the command awaits the completion of a previously sent command. When removing all configured macs on a 578xx adapter, if a positive value were returned the driver would erroneously log it as an error. Fixes: c7b7b483ccc9 ("bnx2x: Don't flush multicast MACs") Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index a851f95c307a..349a46593abf 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12729,7 +12729,7 @@ static int bnx2x_set_mc_list(struct bnx2x *bp) } else { /* If no mc addresses are required, flush the configuration */ rc = bnx2x_config_mcast(bp, &rparam, BNX2X_MCAST_CMD_DEL); - if (rc) + if (rc < 0) BNX2X_ERR("Failed to clear multicast configuration %d\n", rc); } From 85cb73ff9b74785a7fc752875d7f0fe17ca3ea7c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 23 Jun 2017 15:25:37 -0700 Subject: [PATCH 118/149] net: ipv6: reset daddr and dport in sk if connect() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In __ip6_datagram_connect(), reset sk->sk_v6_daddr and inet->dport if an error occurs. In udp_v6_early_demux(), check sk_state to make sure it is in TCP_ESTABLISHED state. Together, this makes sure an unconnected UDP socket won't be considered as a valid candidate for early demux. v3: add TCP_ESTABLISHED state check in udp_v6_early_demux() v2: fix compilation error Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Signed-off-by: Wei Wang Acked-by: Maciej Żenczykowski Signed-off-by: David S.
Miller --- net/ipv6/datagram.c | 8 +++++++- net/ipv6/udp.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index e011122ebd43..5c786f5ab961 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -250,8 +250,14 @@ ipv4_connected: */ err = ip6_datagram_dst_update(sk, true); - if (err) + if (err) { + /* Reset daddr and dport so that udp_v6_early_demux() + * fails to find this socket + */ + memset(&sk->sk_v6_daddr, 0, sizeof(sk->sk_v6_daddr)); + inet->inet_dport = 0; goto out; + } sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 06ec39b79609..75703fda23e7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -879,7 +879,8 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, struct sock *sk; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - if (INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) + if (sk->sk_state == TCP_ESTABLISHED && + INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) return sk; /* Only check first socket in chain */ break; From d747a7a51b00984127a88113cdbbc26f91e9d815 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 24 Jun 2017 23:50:30 -0700 Subject: [PATCH 119/149] tcp: reset sk_rx_dst in tcp_disconnect() We have to reset the sk->sk_rx_dst when we disconnect a TCP connection, because otherwise when we re-connect it this dst reference is simply overridden in tcp_finish_connect(). This fixes a dst leak which leads to a loopback dev refcnt leak. It is a long-standing bug, Kevin reported a very similar (if not same) bug before. Thanks to Andrei for providing such a reliable reproducer which greatly narrows down the problem. Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.") Reported-by: Andrei Vagin Reported-by: Kevin Xu Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b5ea036ca781..40aca7803cf2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2330,6 +2330,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); + dst_release(sk->sk_rx_dst); + sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); /* Clean up fastopen related fields */ From c0bc126f97fb929b3ae02c1c62322645d70eb408 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jun 2017 18:30:05 -0700 Subject: [PATCH 120/149] Linux 4.12-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 57df7569c1bf..6d8a984ed9c9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Fearless Coyote # *DOCUMENTATION* From d8462d0ad350ccd399899de051060ee562a5e537 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 20 Jun 2017 13:43:19 +0100 Subject: [PATCH 121/149] drm/i915: Retire the VMA's fence tracker before unbinding Since we may track unfenced access (GPU access to the vma that explicitly requires no fence), vma->last_fence may be set without any attached fence (vma->fence) and so will not be flushed when we call i915_vma_put_fence(). Since we stopped doing a full retire of the activity trackers for unbind, we need to explicitly retire each tracker. 
Fixes: b0decaf75bd9 ("drm/i915: Track active vma requests") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Link: http://patchwork.freedesktop.org/patch/msgid/20170620124321.1108-1-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin (cherry picked from commit 760a898d8069111704e1bd43f00ebf369ae46e57) Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_vma.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 1aba47024656..f066e2d785f5 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -650,6 +650,11 @@ int i915_vma_unbind(struct i915_vma *vma) break; } + if (!ret) { + ret = i915_gem_active_retire(&vma->last_fence, + &vma->vm->i915->drm.struct_mutex); + } + __i915_vma_unpin(vma); if (ret) return ret; From 2c7367626733e27d6f6d9906db7a31ada587566b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 17 Jun 2017 12:57:44 +0100 Subject: [PATCH 122/149] drm/i915: Hold struct_mutex for per-file stats in debugfs/i915_gem_object As we walk the obj->vma_list in per_file_stats(), we need to hold struct_mutex to prevent alteration of that list. Fixes: 1d2ac403ae3b ("drm: Protect dev->filelist with its own mutex") Fixes: c84455b4bacc ("drm/i915: Move debug only per-request pid tracking from request to ctx") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101460 Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Link: http://patchwork.freedesktop.org/patch/msgid/20170617115744.4452-1-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin (cherry picked from commit 0caf81b5c53d9bd332a95dbcb44db8de0b397a7c) Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_debugfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index d689e511744e..4bd1467c17b1 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -292,6 +292,8 @@ static int per_file_stats(int id, void *ptr, void *data) struct file_stats *stats = data; struct i915_vma *vma; + lockdep_assert_held(&obj->base.dev->struct_mutex); + stats->count++; stats->total += obj->base.size; if (!obj->bind_count) @@ -476,6 +478,8 @@ static int i915_gem_object_info(struct seq_file *m, void *data) struct drm_i915_gem_request *request; struct task_struct *task; + mutex_lock(&dev->struct_mutex); + memset(&stats, 0, sizeof(stats)); stats.file_priv = file->driver_priv; spin_lock(&file->table_lock); @@ -487,7 +491,6 @@ static int i915_gem_object_info(struct seq_file *m, void *data) * still alive (e.g. get_pid(current) => fork() => exit()). * Therefore, we need to protect this ->comm access using RCU. */ - mutex_lock(&dev->struct_mutex); request = list_first_entry_or_null(&file_priv->mm.request_list, struct drm_i915_gem_request, client_link); @@ -497,6 +500,7 @@ static int i915_gem_object_info(struct seq_file *m, void *data) PIDTYPE_PID); print_file_stats(m, task ? task->comm : "", stats); rcu_read_unlock(); + mutex_unlock(&dev->struct_mutex); } mutex_unlock(&dev->filelist_mutex); From 611cdf3695a307fdca3ff3779a1e6cca935e2d31 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 16 Jun 2017 15:05:18 +0100 Subject: [PATCH 123/149] drm/i915: Disable EXEC_OBJECT_ASYNC when doing relocations If we write a relocation into the buffer, we require our own implicit synchronisation added after the start of the execbuf, outside of the user's control. 
As we may end up clflushing, or doing the patch itself on the GPU, asynchronously we need to look at the implicit serialisation on obj->resv and hence need to disable EXEC_OBJECT_ASYNC for this object. If the user does trigger a stall for relocations, we make sure the stall is complete enough so that the batch is not submitted before we complete those relocations. Fixes: 77ae9957897d ("drm/i915: Enable userspace to opt-out of implicit fencing") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Jason Ekstrand Reviewed-by: Joonas Lahtinen (cherry picked from commit 071750e550af46b5d3a84ad56c2a108c3e136284) [danvet: Resolve conflicts, resolution reviewed by Tvrtko on irc.] Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index a3e59c8ef27b..9ad13eeed904 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -546,11 +546,12 @@ repeat: } static int -i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, +i915_gem_execbuffer_relocate_entry(struct i915_vma *vma, struct eb_vmas *eb, struct drm_i915_gem_relocation_entry *reloc, struct reloc_cache *cache) { + struct drm_i915_gem_object *obj = vma->obj; struct drm_i915_private *dev_priv = to_i915(obj->base.dev); struct drm_gem_object *target_obj; struct drm_i915_gem_object *target_i915_obj; @@ -628,6 +629,16 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj, return -EINVAL; } + /* + * If we write into the object, we need to force the synchronisation + * barrier, either with an asynchronous clflush or if we executed the + * patching using the GPU (though that should be serialised by the + * timeline). To be completely sure, and since we are required to + * do relocations we are already stalling, disable the user's opt + * of our synchronisation. + */ + vma->exec_entry->flags &= ~EXEC_OBJECT_ASYNC; + ret = relocate_entry(obj, reloc, cache, target_offset); if (ret) return ret; @@ -678,7 +689,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, do { u64 offset = r->presumed_offset; - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, r, &cache); + ret = i915_gem_execbuffer_relocate_entry(vma, eb, r, &cache); if (ret) goto out; @@ -726,7 +737,7 @@ i915_gem_execbuffer_relocate_vma_slow(struct i915_vma *vma, reloc_cache_init(&cache, eb->i915); for (i = 0; i < entry->relocation_count; i++) { - ret = i915_gem_execbuffer_relocate_entry(vma->obj, eb, &relocs[i], &cache); + ret = i915_gem_execbuffer_relocate_entry(vma, eb, &relocs[i], &cache); if (ret) break; } From 82fcee526ba8ca2c5d378bdf51b21b7eb058fe3a Mon Sep 17 00:00:00 2001 From: Deepak Rawat Date: Mon, 26 Jun 2017 14:39:08 +0200 Subject: [PATCH 124/149] drm/vmwgfx: Free hash table allocated by cmdbuf managed res mgr The hash table created during vmw_cmdbuf_res_man_create was never freed. This causes memory leak in context creation. Added the corresponding drm_ht_remove in vmw_cmdbuf_res_man_destroy. Tested for memory leak by running piglit overnight and kernel memory is not inflated which earlier was. 
Cc: Signed-off-by: Deepak Rawat Reviewed-by: Sinclair Yeh Signed-off-by: Thomas Hellstrom --- drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c index 13db8a2851ed..1f013d45c9e9 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c @@ -321,6 +321,7 @@ void vmw_cmdbuf_res_man_destroy(struct vmw_cmdbuf_res_manager *man) list_for_each_entry_safe(entry, next, &man->list, head) vmw_cmdbuf_res_free(man, entry); + drm_ht_remove(&man->resources); kfree(man); } From 6f64ec74515925cced6df4571638b5a099a49aae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jun 2017 07:02:20 -0700 Subject: [PATCH 125/149] net: prevent sign extension in dev_get_stats() Similar to the fix provided by Dominik Heidler in commit 9b3dc0a17d73 ("l2tp: cast l2tp traffic counter to unsigned") we need to take care of 32bit kernels in dev_get_stats(). When using atomic_long_read(), we add a 'long' to u64 and might misinterpret high order bit, unless we cast to unsigned. Fixes: caf586e5f23ce ("net: add a core netdev->rx_dropped counter") Fixes: 015f0688f57ca ("net: net: add a core netdev->tx_dropped counter") Fixes: 6e7333d315a76 ("net: add rx_nohandler stat counter") Signed-off-by: Eric Dumazet Cc: Jarod Wilson Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 7243421c9783..91bb55070533 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7783,9 +7783,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += atomic_long_read(&dev->rx_dropped); - storage->tx_dropped += atomic_long_read(&dev->tx_dropped); - storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); + storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); From 00a0ea33b495ee6149bf5a77ac5807ce87323abb Mon Sep 17 00:00:00 2001 From: Vallish Vaidyeshwara Date: Fri, 23 Jun 2017 18:53:06 +0000 Subject: [PATCH 126/149] dm thin: do not queue freed thin mapping for next stage processing process_prepared_discard_passdown_pt1() should cleanup dm_thin_new_mapping in cases of error. dm_pool_inc_data_range() can fail trying to get a block reference: metadata operation 'dm_pool_inc_data_range' failed: error = -61 When dm_pool_inc_data_range() fails, dm thin aborts current metadata transaction and marks pool as PM_READ_ONLY. Memory for thin mapping is released as well. However, current thin mapping will be queued onto next stage as part of queue_passdown_pt2() or passdown_endio(). This dangling thin mapping memory when processed and accessed in next stage will lead to device mapper crashing. 
Code flow without fix: -> process_prepared_discard_passdown_pt1(m) -> dm_thin_remove_range() -> discard passdown --> passdown_endio(m) queues m onto next stage -> dm_pool_inc_data_range() fails, frees memory m but does not remove it from next stage queue -> process_prepared_discard_passdown_pt2(m) -> processes freed memory m and crashes One such stack: Call Trace: [] dm_cell_release_no_holder+0x2f/0x70 [dm_bio_prison] [] cell_defer_no_holder+0x3c/0x80 [dm_thin_pool] [] process_prepared_discard_passdown_pt2+0x4b/0x90 [dm_thin_pool] [] process_prepared+0x81/0xa0 [dm_thin_pool] [] do_worker+0xc5/0x820 [dm_thin_pool] [] ? __schedule+0x244/0x680 [] ? pwq_activate_delayed_work+0x42/0xb0 [] process_one_work+0x153/0x3f0 [] worker_thread+0x12b/0x4b0 [] ? rescuer_thread+0x350/0x350 [] kthread+0xca/0xe0 [] ? kthread_park+0x60/0x60 [] ret_from_fork+0x25/0x30 The fix is to first take the block ref count for discarded block and then do a passdown discard of this block. If block ref count fails, then bail out aborting current metadata transaction, mark pool as PM_READ_ONLY and also free current thin mapping memory (existing error handling code) without queueing this thin mapping onto next stage of processing. If block ref count succeeds, then passdown discard of this block. Discard callback of passdown_endio() will queue this thin mapping onto next stage of processing. Code flow with fix: -> process_prepared_discard_passdown_pt1(m) -> dm_thin_remove_range() -> dm_pool_inc_data_range() --> if fails, free memory m and bail out -> discard passdown --> passdown_endio(m) queues m onto next stage Cc: stable # v4.9+ Reviewed-by: Eduardo Valentin Reviewed-by: Cristian Gafton Reviewed-by: Anchal Agarwal Signed-off-by: Vallish Vaidyeshwara Reviewed-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 17ad50daed08..28808e5ec0fd 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1094,6 +1094,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) return; } + /* + * Increment the unmapped blocks. This prevents a race between the + * passdown io and reallocation of freed blocks. + */ + r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); + if (r) { + metadata_operation_failed(pool, "dm_pool_inc_data_range", r); + bio_io_error(m->bio); + cell_defer_no_holder(tc, m->cell); + mempool_free(m, pool->mapping_pool); + return; + } + discard_parent = bio_alloc(GFP_NOIO, 1); if (!discard_parent) { DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.", @@ -1114,19 +1127,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) end_discard(&op, r); } } - - /* - * Increment the unmapped blocks. This prevents a race between the - * passdown io and reallocation of freed blocks. 
- */ - r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end); - if (r) { - metadata_operation_failed(pool, "dm_pool_inc_data_range", r); - bio_io_error(m->bio); - cell_defer_no_holder(tc, m->cell); - mempool_free(m, pool->mapping_pool); - return; - } } static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) From 85688d9adf68557233d974ef28971e69b89fb690 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 26 Jun 2017 18:47:00 +0300 Subject: [PATCH 127/149] fsl/fman: add dependency on HAS_DMA A previous commit (5567e989198b5a8d) inserted a dependency on DMA API that requires HAS_DMA to be added in Kconfig. Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fman/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/fman/Kconfig b/drivers/net/ethernet/freescale/fman/Kconfig index dc0850b3b517..8870a9a798ca 100644 --- a/drivers/net/ethernet/freescale/fman/Kconfig +++ b/drivers/net/ethernet/freescale/fman/Kconfig @@ -2,6 +2,7 @@ config FSL_FMAN tristate "FMan support" depends on FSL_SOC || ARCH_LAYERSCAPE || COMPILE_TEST select GENERIC_ALLOCATOR + depends on HAS_DMA select PHYLIB default n help From e20bd60bf62a2448be873653c7febca1d4d73afc Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Mon, 26 Jun 2017 12:41:20 -0500 Subject: [PATCH 128/149] net: usb: asix88179_178a: Add support for the Belkin B2B128 The Belkin B2B128 is a USB 3.0 Hub + Gigabit Ethernet Adapter, the Ethernet adapter uses the ASIX AX88179 USB 3.0 to Gigabit Ethernet chip supported by this driver, add the USB ID for the same. This patch is based on work by Geoffrey Tran who has indicated they would like this upstreamed by someone more familiar with the upstreaming process. Signed-off-by: Andrew F. Davis Signed-off-by: David S. Miller --- drivers/net/usb/ax88179_178a.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 51cf60092a18..4037ab27734a 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1722,6 +1722,18 @@ static const struct driver_info lenovo_info = { .tx_fixup = ax88179_tx_fixup, }; +static const struct driver_info belkin_info = { + .description = "Belkin USB Ethernet Adapter", + .bind = ax88179_bind, + .unbind = ax88179_unbind, + .status = ax88179_status, + .link_reset = ax88179_link_reset, + .reset = ax88179_reset, + .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .rx_fixup = ax88179_rx_fixup, + .tx_fixup = ax88179_tx_fixup, +}; + static const struct usb_device_id products[] = { { /* ASIX AX88179 10/100/1000 */ @@ -1751,6 +1763,10 @@ static const struct usb_device_id products[] = { /* Lenovo OneLinkDock Gigabit LAN */ USB_DEVICE(0x17ef, 0x304b), .driver_info = (unsigned long)&lenovo_info, +}, { + /* Belkin B2B128 USB 3.0 Hub + Gigabit Ethernet Adapter */ + USB_DEVICE(0x050d, 0x0128), + .driver_info = (unsigned long)&belkin_info, }, { }, }; From ebef7368571d88f0f80b817e6898075c62265b4e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 27 Jun 2017 17:44:05 -0600 Subject: [PATCH 129/149] nvme/pci: Fix stuck nvme reset The controller state is set to resetting prior to disabling the controller, so this patch accounts for that state when deciding if it needs to freeze the queues. Without this, an 'nvme reset /dev/nvme0' blocks forever because the queues were never frozen. 
Fixes: 82b057caefaf ("nvme-pci: fix multiple ctrl removal scheduling") Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 951042a375d6..40c7581caeb0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1805,7 +1805,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); - if (dev->ctrl.state == NVME_CTRL_LIVE) + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) nvme_start_freeze(&dev->ctrl); dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); From 898fc11bb2bd4fbcefb685872d9fffaba2c8edaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Jun 2017 10:16:56 -0400 Subject: [PATCH 130/149] NFS: Trunking detection should handle ERESTARTSYS/EINTR Currently, it will return EIO in those cases. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b34de036501b..cbf82b0d4467 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2134,6 +2134,8 @@ again: put_rpccred(cred); switch (status) { case 0: + case -EINTR: + case -ERESTARTSYS: break; case -ETIMEDOUT: if (clnt->cl_softrtry) From bd171930e6a3de4f5cffdafbb944e50093dfb59b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Jun 2017 17:33:38 -0400 Subject: [PATCH 131/149] NFSv4.1: Fix a race in nfs4_proc_layoutget If the task calling layoutget is signalled, then it is possible for the calls to nfs4_sequence_free_slot() and nfs4_layoutget_prepare() to race, in which case we leak a slot. The fix is to move the call to nfs4_sequence_free_slot() into the nfs4_layoutget_release() so that it gets called at task teardown time. Fixes: 2e80dbe7ac51 ("NFSv4.1: Close callback races for OPEN, LAYOUTGET...") Cc: stable@vger.kernel.org # v4.8+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e43fd1270c4..dbfa18900e25 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8417,6 +8417,7 @@ static void nfs4_layoutget_release(void *calldata) size_t max_pages = max_response_pages(server); dprintk("--> %s\n", __func__); + nfs4_sequence_free_slot(&lgp->res.seq_res); nfs4_free_pages(lgp->args.layout.pages, max_pages); pnfs_put_layout_hdr(NFS_I(inode)->layout); put_nfs_open_context(lgp->args.ctx); @@ -8491,7 +8492,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); - nfs4_sequence_free_slot(&lgp->res.seq_res); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) From d9f2950006f110f54444a10442752372ee568289 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Jun 2017 11:12:59 -0400 Subject: [PATCH 132/149] Revert "NFS: nfs_rename() handle -ERESTARTSYS dentry left behind" This reverts commit 920b4530fb80430ff30ef83efe21ba1fa5623731 which could call d_move() without holding the directory's i_mutex, and reverts commit d4ea7e3c5c0e341c15b073016dbf3ab6c65f12f3 "NFS: Fix old dentry rehash after move", which was a follow-up fix. 
Signed-off-by: Benjamin Coddington Fixes: 920b4530fb80 ("NFS: nfs_rename() handle -ERESTARTSYS dentry left behind") Cc: stable@vger.kernel.org # v4.10+ Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32ccd7754f8a..2ac00bf4ecf1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1946,29 +1946,6 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); -static void -nfs_complete_rename(struct rpc_task *task, struct nfs_renamedata *data) -{ - struct dentry *old_dentry = data->old_dentry; - struct dentry *new_dentry = data->new_dentry; - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - - nfs_mark_for_revalidate(old_inode); - - switch (task->tk_status) { - case 0: - if (new_inode != NULL) - nfs_drop_nlink(new_inode); - d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, - nfs_save_change_attribute(data->new_dir)); - break; - case -ENOENT: - nfs_dentry_handle_enoent(old_dentry); - } -} - /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -1999,7 +1976,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL; + struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; @@ -2022,8 +1999,10 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * To prevent any new references to the target during the * rename, we unhash the dentry in advance. */ - if (!d_unhashed(new_dentry)) + if (!d_unhashed(new_dentry)) { d_drop(new_dentry); + rehash = new_dentry; + } if (d_count(new_dentry) > 2) { int err; @@ -2040,6 +2019,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } @@ -2048,8 +2028,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_inode != NULL) NFS_PROTO(new_inode)->return_delegation(new_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, - nfs_complete_rename); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2059,9 +2038,27 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error == 0) error = task->tk_status; rpc_put_task(task); + nfs_mark_for_revalidate(old_inode); out: + if (rehash) + d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); + if (!error) { + if (new_inode != NULL) + nfs_drop_nlink(new_inode); + /* + * The d_move() should be here instead of in an async RPC completion + * handler because we need the proper locks to move the dentry. If + * we're interrupted by a signal, the async RPC completion handler + * should mark the directories for revalidation. + */ + d_move(old_dentry, new_dentry); + nfs_set_verifier(new_dentry, + nfs_save_change_attribute(new_dir)); + } else if (error == -ENOENT) + nfs_dentry_handle_enoent(old_dentry); + /* new dentry created? 
*/ if (dentry) dput(dentry); From 2e31b4cb895ae78db31dffb860cd255d86c6561c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Jun 2017 17:40:50 -0400 Subject: [PATCH 133/149] NFSv4.1: nfs4_callback_free_slot() cannot call nfs4_slot_tbl_drain_complete() The current code works only for the case where we have exactly one slot, which is no longer true. nfs4_free_slot() will automatically declare the callback channel to be drained when all slots have been returned. Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c14758e08d73..390ac9c39c59 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -753,7 +753,6 @@ static void nfs4_callback_free_slot(struct nfs4_session *session, * A single slot, so highest used slotid is either 0 or -1 */ nfs4_free_slot(tbl, slot); - nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } From f4a4381ba4944603905167d044d865990356ba22 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 27 Jun 2017 16:02:51 +0200 Subject: [PATCH 134/149] drm/etnaviv: fix submit flags getting overwritten by BO content The addition of the flags member to etnaviv_gem_submit structure didn't take into account that the last member of this structure is a variable length array. Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h index c4a091e87426..e437fba1209d 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h @@ -106,9 +106,10 @@ struct etnaviv_gem_submit { struct etnaviv_gpu *gpu; struct ww_acquire_ctx ticket; struct dma_fence *fence; + u32 flags; unsigned int nr_bos; struct etnaviv_gem_submit_bo bos[0]; - u32 flags; + /* No new members here, the previous one is variable-length! */ }; int etnaviv_gem_wait_bo(struct etnaviv_gpu *gpu, struct drm_gem_object *obj, From 426ef1bb40a3bd99ccd16d629bd1294805e96fc6 Mon Sep 17 00:00:00 2001 From: Daniel Stone Date: Thu, 22 Jun 2017 12:22:22 +0100 Subject: [PATCH 135/149] drm/etnaviv: Fix implicit/explicit sync sense inversion We were reading the no-implicit sync flag the wrong way around, synchronizing too much for the explicit case, and not at all for the implicit case. Oops. 
Signed-off-by: Daniel Stone Signed-off-by: Lucas Stach --- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index de80ee1b71df..1013765274da 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -172,7 +172,7 @@ static int submit_fence_sync(const struct etnaviv_gem_submit *submit) for (i = 0; i < submit->nr_bos; i++) { struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj; bool write = submit->bos[i].flags & ETNA_SUBMIT_BO_WRITE; - bool explicit = !(submit->flags & ETNA_SUBMIT_NO_IMPLICIT); + bool explicit = !!(submit->flags & ETNA_SUBMIT_NO_IMPLICIT); ret = etnaviv_gpu_fence_sync_obj(etnaviv_obj, context, write, explicit); From 9ae3b3f52c62ddd5eb12c57f195f4f38121faa01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Jun 2017 15:30:13 -0600 Subject: [PATCH 136/149] block: provide bio_uninit() for freeing integrity/task associations Wen reports significant memory leaks with DIF and O_DIRECT: "With nvme device + T10 enabled, On a system it has 256GB and started logging /proc/meminfo & /proc/slabinfo for every minute and in an hour it increased by 15968128 kB or ~15+GB.. Approximately 256 MB / minute leaking. /proc/meminfo | grep SUnreclaim... SUnreclaim: 6752128 kB SUnreclaim: 6874880 kB SUnreclaim: 7238080 kB .... SUnreclaim: 22307264 kB SUnreclaim: 22485888 kB SUnreclaim: 22720256 kB When testcases with T10 enabled call into __blkdev_direct_IO_simple, code doesn't free memory allocated by bio_integrity_alloc. The patch fixes the issue. HTX has been run with +60 hours without failure." Since __blkdev_direct_IO_simple() allocates the bio on the stack, it doesn't go through the regular bio free. This means that any ancillary data allocated with the bio through the stack is not freed. Hence, we can leak the integrity data associated with the bio, if the device is using DIF/DIX. Fix this by providing a bio_uninit() and export it, so that we can use it to free this data. Note that this is a minimal fix for this issue. Any current user of bio's that are allocated outside of bio_alloc_bioset() suffers from this issue, most notably some drivers. We will fix those in a more comprehensive patch for 4.13. This also means that the commit marked as being fixed by this isn't the real culprit, it's just the most obvious one out there. Fixes: 542ff7bf18c6 ("block: new direct I/O implementation") Reported-by: Wen Xiong Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 12 +++++++++--- fs/block_dev.c | 5 ++++- include/linux/bio.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 888e7801c638..26b0810fb8ea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -240,20 +240,21 @@ fallback: return bvl; } -static void __bio_free(struct bio *bio) +void bio_uninit(struct bio *bio) { bio_disassociate_task(bio); if (bio_integrity(bio)) bio_integrity_free(bio); } +EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p; - __bio_free(bio); + bio_uninit(bio); if (bs) { bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); @@ -271,6 +272,11 @@ static void bio_free(struct bio *bio) } } +/* + * Users of this function have their own bio allocation.
Subsequently, + * they must remember to pair any call to bio_init() with bio_uninit() + * when IO has completed, or when the bio is released. + */ void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs) { @@ -297,7 +303,7 @@ void bio_reset(struct bio *bio) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - __bio_free(bio); + bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags; diff --git a/fs/block_dev.c b/fs/block_dev.c index 519599dddd36..0a7404ef9335 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -263,7 +263,10 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, kfree(vecs); if (unlikely(bio.bi_error)) - return bio.bi_error; + ret = bio.bi_error; + + bio_uninit(&bio); + return ret; } diff --git a/include/linux/bio.h b/include/linux/bio.h index d1b04b0e99cf..a7e29fa0981f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -426,6 +426,7 @@ extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs); +extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); From 6474924e2b5ddb0030c355558966adcbe3b49022 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 28 Jun 2017 15:30:02 +0200 Subject: [PATCH 137/149] arch: remove unused macro/function thread_saved_pc() The only user of thread_saved_pc() in non-arch-specific code was removed in commit 8243d5597793 ("sched/core: Remove pointless printout in sched_show_task()"). Remove the implementations as well. Some architectures use thread_saved_pc() in their arch-specific code. Leave their thread_saved_pc() intact. Signed-off-by: Tobias Klauser Acked-by: Geert Uytterhoeven Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/arc/include/asm/processor.h | 2 -- arch/blackfin/include/asm/processor.h | 5 ----- arch/c6x/include/asm/processor.h | 5 ----- arch/cris/arch-v10/kernel/process.c | 8 -------- arch/cris/arch-v32/kernel/process.c | 8 -------- arch/cris/include/asm/processor.h | 2 -- arch/frv/include/asm/processor.h | 5 ----- arch/frv/kernel/process.c | 9 --------- arch/h8300/include/asm/processor.h | 4 ---- arch/h8300/kernel/process.c | 5 ----- arch/hexagon/include/asm/processor.h | 3 --- arch/hexagon/kernel/process.c | 8 -------- arch/ia64/include/asm/processor.h | 17 ----------------- arch/m32r/include/asm/processor.h | 2 -- arch/m32r/kernel/process.c | 8 -------- arch/m68k/include/asm/processor.h | 2 -- arch/m68k/kernel/process.c | 14 -------------- arch/microblaze/include/asm/processor.h | 6 ------ arch/microblaze/kernel/process.c | 17 ----------------- arch/mn10300/include/asm/processor.h | 5 ----- arch/mn10300/kernel/process.c | 8 -------- arch/nios2/include/asm/processor.h | 3 --- arch/openrisc/include/asm/processor.h | 5 ----- arch/openrisc/kernel/process.c | 5 ----- arch/parisc/include/asm/processor.h | 5 ----- arch/parisc/kernel/process.c | 5 ----- arch/powerpc/include/asm/processor.h | 6 ------ arch/s390/include/asm/processor.h | 5 ----- arch/s390/kernel/process.c | 25 ------------------------- arch/score/include/asm/processor.h | 1 - arch/score/kernel/process.c | 5 ----- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 2 -- arch/sparc/kernel/process_32.c | 8 -------- arch/sparc/kernel/process_64.c | 19 ------------------- arch/tile/include/asm/processor.h | 7 ------- arch/um/include/asm/processor-generic.h | 2 -- arch/um/kernel/um_arch.c | 6 ------ 
arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/process.c | 11 ----------- arch/xtensa/include/asm/processor.h | 2 -- 41 files changed, 270 deletions(-) diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 6e1242da0159..4104a0839214 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -86,8 +86,6 @@ struct task_struct; #define TSK_K_BLINK(tsk) TSK_K_REG(tsk, 4) #define TSK_K_FP(tsk) TSK_K_REG(tsk, 0) -#define thread_saved_pc(tsk) TSK_K_BLINK(tsk) - extern void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long usp); diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 85d4af97c986..dbdbb8a558df 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -75,11 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Return saved PC of a blocked thread. - */ -#define thread_saved_pc(tsk) (tsk->thread.pc) - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h index b9eb3da7f278..7c87b5be53b5 100644 --- a/arch/c6x/include/asm/processor.h +++ b/arch/c6x/include/asm/processor.h @@ -95,11 +95,6 @@ static inline void release_thread(struct task_struct *dead_task) #define copy_segments(tsk, mm) do { } while (0) #define release_segments(mm) do { } while (0) -/* - * saved PC of a blocked thread. - */ -#define thread_saved_pc(tsk) (task_pt_regs(tsk)->pc) - /* * saved kernel SP and DP of a blocked thread. */ diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index e299d30105b5..a2cdb1521aca 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -69,14 +69,6 @@ void hard_reset_now (void) while(1) /* waiting for RETRIBUTION! */ ; } -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *t) -{ - return task_pt_regs(t)->irp; -} - /* setup the child's kernel stack with a pt_regs and switch_stack on it. * it will be un-nested during _resume and _ret_from_sys_call when the * new thread is scheduled. diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c530a8fa87ce..fe87b383fbf3 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -84,14 +84,6 @@ hard_reset_now(void) ; /* Wait for reset. */ } -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *t) -{ - return task_pt_regs(t)->erp; -} - /* * Setup the child's kernel stack with a pt_regs and call switch_stack() on it. * It will be unnested during _resume and _ret_from_sys_call when the new thread diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h index 15b815df29c1..bc2729e4b2c9 100644 --- a/arch/cris/include/asm/processor.h +++ b/arch/cris/include/asm/processor.h @@ -52,8 +52,6 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->thread.usp) -extern unsigned long thread_saved_pc(struct task_struct *tsk); - /* Free all resources held by a thread. 
*/ static inline void release_thread(struct task_struct *dead_task) { diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index ddaeb9cc9143..e4d08d74ed9f 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -96,11 +96,6 @@ extern asmlinkage void *restore_user_regs(const struct user_context *target, ... #define release_segments(mm) do { } while (0) #define forget_segments() do { } while (0) -/* - * Return saved PC of a blocked thread. - */ -extern unsigned long thread_saved_pc(struct task_struct *tsk); - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) ((tsk)->thread.frame0->pc) diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 5a4c92abc99e..a957b374e3a6 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -198,15 +198,6 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - /* Check whether the thread is blocked in resume() */ - if (in_sched_functions(tsk->thread.pc)) - return ((unsigned long *)tsk->thread.fp)[2]; - else - return tsk->thread.pc; -} - int elf_check_arch(const struct elf32_hdr *hdr) { unsigned long hsr0 = __get_HSR(0); diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 65132d7ae9e5..afa53147e66a 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -110,10 +110,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk); unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 0f5db5bb561b..d1ddcabbbe83 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -129,11 +129,6 @@ int copy_thread(unsigned long clone_flags, return 0; } -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((struct pt_regs *)tsk->thread.esp0)->pc; -} - unsigned long get_wchan(struct task_struct *p) { unsigned long fp, pc; diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 45a825402f63..ce67940860a5 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -33,9 +33,6 @@ /* task_struct, defined elsewhere, is the "process descriptor" */ struct task_struct; -/* this is defined in arch/process.c */ -extern unsigned long thread_saved_pc(struct task_struct *tsk); - extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index de715bab7956..656050c2e6a0 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -60,14 +60,6 @@ void arch_cpu_idle(void) local_irq_enable(); } -/* - * Return saved PC of a blocked thread - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return 0; -} - /* * Copy architecture-specific thread state */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 26a63d69c599..ab982f07ea68 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -601,23 +601,6 @@ ia64_set_unat (__u64 *unat, void *spill_addr, unsigned long nat) *unat = (*unat & ~mask) | (nat << bit); } -/* - * Return saved PC of a blocked thread. 
- * Note that the only way T can block is through a call to schedule() -> switch_to(). - */ -static inline unsigned long -thread_saved_pc (struct task_struct *t) -{ - struct unw_frame_info info; - unsigned long ip; - - unw_init_from_blocked_task(&info, t); - if (unw_unwind(&info) < 0) - return 0; - unw_get_ip(&info, &ip); - return ip; -} - /* * Get the current instruction/program counter value. */ diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h index 5767367550c6..657874eeeccc 100644 --- a/arch/m32r/include/asm/processor.h +++ b/arch/m32r/include/asm/processor.h @@ -122,8 +122,6 @@ extern void release_thread(struct task_struct *); extern void copy_segments(struct task_struct *p, struct mm_struct * mm); extern void release_segments(struct mm_struct * mm); -extern unsigned long thread_saved_pc(struct task_struct *); - /* Copy and release all segment info associated with a VM */ #define copy_segments(p, mm) do { } while (0) #define release_segments(mm) do { } while (0) diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index d8ffcfec599c..8cd7e03f4370 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -39,14 +39,6 @@ #include -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return tsk->thread.lr; -} - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 77239e81379b..94c36030440c 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -130,8 +130,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long thread_saved_pc(struct task_struct *tsk); - unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) \ diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index e475c945c8b2..7df92f8b0781 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -40,20 +40,6 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); - -/* - * Return saved PC from a blocked thread - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; - /* Check whether the thread is blocked in resume() */ - if (in_sched_functions(sw->retpc)) - return ((unsigned long *)sw->a6)[1]; - else - return sw->retpc; -} - void arch_cpu_idle(void) { #if defined(MACH_ATARI_ONLY) diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 37ef196e4519..330d556860ba 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -69,8 +69,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -extern unsigned long thread_saved_pc(struct task_struct *t); - extern unsigned long get_wchan(struct task_struct *p); # define KSTK_EIP(tsk) (0) @@ -121,10 +119,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Return saved (kernel) PC of a blocked thread. */ -# define thread_saved_pc(tsk) \ - ((tsk)->thread.regs ? (tsk)->thread.regs->r15 : 0) - unsigned long get_wchan(struct task_struct *p); /* The size allocated for kernel stacks. This _must_ be a power of two! 
*/ diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index e92a817e645f..6527ec22f158 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -119,23 +119,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } -#ifndef CONFIG_MMU -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct cpu_context *ctx = - &(((struct thread_info *)(tsk->stack))->cpu_context); - - /* Check whether the thread is blocked in resume() */ - if (in_sched_functions(ctx->r15)) - return (unsigned long)ctx->r15; - else - return ctx->r14; -} -#endif - unsigned long get_wchan(struct task_struct *p) { /* TBD (used by procfs) */ diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h index 18e17abf7664..3ae479117b42 100644 --- a/arch/mn10300/include/asm/processor.h +++ b/arch/mn10300/include/asm/processor.h @@ -132,11 +132,6 @@ static inline void start_thread(struct pt_regs *regs, /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* - * Return saved PC of a blocked thread. - */ -extern unsigned long thread_saved_pc(struct task_struct *tsk); - unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(task) ((task)->thread.uregs) diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index c9fa42619c6a..89e8027e07fb 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -39,14 +39,6 @@ #include #include "internal.h" -/* - * return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *) tsk->thread.sp)[3]; -} - /* * power off function, if any */ diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index 3bbbc3d798e5..4944e2e1d8b0 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -75,9 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Return saved PC of a blocked thread. */ -#define thread_saved_pc(tsk) ((tsk)->thread.kregs->ea) - extern unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(p) \ diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index a908e6c30a00..396d8f306c21 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -84,11 +84,6 @@ void start_thread(struct pt_regs *regs, unsigned long nip, unsigned long sp); void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); -/* - * Return saved PC of a blocked thread. 
For now, this is the "user" PC - */ -extern unsigned long thread_saved_pc(struct task_struct *t); - #define init_stack (init_thread_union.stack) #define cpu_relax() barrier() diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 106859ae27ff..f9b77003f113 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -110,11 +110,6 @@ void show_regs(struct pt_regs *regs) show_registers(regs); } -unsigned long thread_saved_pc(struct task_struct *t) -{ - return (unsigned long)user_regs(t->stack)->pc; -} - void release_thread(struct task_struct *dead_task) { } diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index a3661ee6b060..4c6694b4e77e 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -163,12 +163,7 @@ struct thread_struct { .flags = 0 \ } -/* - * Return saved PC of a blocked thread. This is used by ps mostly. - */ - struct task_struct; -unsigned long thread_saved_pc(struct task_struct *t); void show_trace(struct task_struct *task, unsigned long *stack); /* diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 4516a5b53f38..b64d7d21646e 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -239,11 +239,6 @@ copy_thread(unsigned long clone_flags, unsigned long usp, return 0; } -unsigned long thread_saved_pc(struct task_struct *t) -{ - return t->thread.regs.kpc; -} - unsigned long get_wchan(struct task_struct *p) { diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index bb99b651085a..1189d04f3bd1 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -378,12 +378,6 @@ struct thread_struct { } #endif -/* - * Return saved PC of a blocked thread. For now, this is the "user" PC - */ -#define thread_saved_pc(tsk) \ - ((tsk)->thread.regs? (tsk)->thread.regs->nip: 0) - #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.regs) unsigned long get_wchan(struct task_struct *p); diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 60d395fdc864..aeac013968f2 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -221,11 +221,6 @@ extern void release_thread(struct task_struct *); /* Free guarded storage control block for current */ void exit_thread_gs(void); -/* - * Return saved PC of a blocked thread. - */ -extern unsigned long thread_saved_pc(struct task_struct *t); - unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) \ (task_stack_page(tsk) + THREAD_SIZE) - 1) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 999d7154bbdc..bb32b8618bf6 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -41,31 +41,6 @@ asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); -/* - * Return saved PC of a blocked thread. used in kernel/sched. - * resume in entry.S does not create a new stack frame, it - * just stores the registers %r6-%r15 to the frame given by - * schedule. We want to return the address of the caller of - * schedule, so we have to walk the backchain one time to - * find the frame schedule() store its return address. 
- */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct stack_frame *sf, *low, *high; - - if (!tsk || !task_stack_page(tsk)) - return 0; - low = task_stack_page(tsk); - high = (struct stack_frame *) task_pt_regs(tsk); - sf = (struct stack_frame *) tsk->thread.ksp; - if (sf <= low || sf > high) - return 0; - sf = (struct stack_frame *) sf->back_chain; - if (sf <= low || sf > high) - return 0; - return sf->gprs[8]; -} - extern void kernel_thread_starter(void); /* diff --git a/arch/score/include/asm/processor.h b/arch/score/include/asm/processor.h index d9a922d8711b..299274581968 100644 --- a/arch/score/include/asm/processor.h +++ b/arch/score/include/asm/processor.h @@ -13,7 +13,6 @@ struct task_struct; */ extern void (*cpu_wait)(void); -extern unsigned long thread_saved_pc(struct task_struct *tsk); extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp); extern unsigned long get_wchan(struct task_struct *p); diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index eb64d7a677cb..6e20241a1ed4 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -101,11 +101,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *r) return 1; } -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return task_pt_regs(tsk)->cp0_epc; -} - unsigned long get_wchan(struct task_struct *task) { if (!task || task == current || task->state == TASK_RUNNING) diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index dd27159819eb..b395e5620c0b 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -67,9 +67,6 @@ struct thread_struct { .current_ds = KERNEL_DS, \ } -/* Return saved PC of a blocked thread. */ -unsigned long thread_saved_pc(struct task_struct *t); - /* Do necessary setup to start up a newly executed thread. */ static inline void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index b58ee9018433..f04dc5a43062 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -89,9 +89,7 @@ struct thread_struct { #include #include -/* Return saved PC of a blocked thread. */ struct task_struct; -unsigned long thread_saved_pc(struct task_struct *); /* On Uniprocessor, even in RMO processes see TSO semantics */ #ifdef CONFIG_SMP diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index b6dac8e980f0..9245f93398c7 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -176,14 +176,6 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp) printk("\n"); } -/* - * Note: sparc64 has a pretty intricated thread_saved_pc, check it out. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return task_thread_info(tsk)->kpc; -} - /* * Free current thread data structures etc.. 
*/ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 1badc493e62e..b96104da5bd6 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -400,25 +400,6 @@ core_initcall(sparc_sysrq_init); #endif -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct thread_info *ti = task_thread_info(tsk); - unsigned long ret = 0xdeadbeefUL; - - if (ti && ti->ksp) { - unsigned long *sp; - sp = (unsigned long *)(ti->ksp + STACK_BIAS); - if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL && - sp[14]) { - unsigned long *fp; - fp = (unsigned long *)(sp[14] + STACK_BIAS); - if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL) - ret = fp[15]; - } - } - return ret; -} - /* Free current thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index 0bc9968b97a1..f71e5206650b 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -214,13 +214,6 @@ static inline void release_thread(struct task_struct *dead_task) extern void prepare_exit_to_usermode(struct pt_regs *regs, u32 flags); - -/* - * Return saved (kernel) PC of a blocked thread. - * Only used in a printk() in kernel/sched/core.c, so don't work too hard. - */ -#define thread_saved_pc(t) ((t)->thread.pc) - unsigned long get_wchan(struct task_struct *p); /* Return initial ksp value for given task. */ diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 2d1e0dd5bb0b..f6d1a3f747a9 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -58,8 +58,6 @@ static inline void release_thread(struct task_struct *task) { } -extern unsigned long thread_saved_pc(struct task_struct *t); - static inline void mm_copy_segments(struct mm_struct *from_mm, struct mm_struct *new_mm) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 64a1fd06f3fd..7b5640117325 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -56,12 +56,6 @@ union thread_union cpu0_irqstack __attribute__((__section__(".data..init_irqstack"))) = { INIT_THREAD_INFO(init_task) }; -unsigned long thread_saved_pc(struct task_struct *task) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return os_process_pc(userspace_pid[0]); -} - /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..a28b671f1549 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -860,8 +860,6 @@ extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ -extern unsigned long thread_saved_pc(struct task_struct *tsk); - extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0bb88428cbf2..3ca198080ea9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -544,17 +544,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) return randomize_page(mm->brk, 0x02000000); } -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. 
- */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - struct inactive_task_frame *frame = - (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); - return READ_ONCE_NOCHECK(frame->ret_addr); -} - /* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 003eeee3fbc6..30ee8c608853 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -213,8 +213,6 @@ struct mm_struct; #define release_segments(mm) do { } while(0) #define forget_segments() do { } while (0) -#define thread_saved_pc(tsk) (task_pt_regs(tsk)->pc) - extern unsigned long get_wchan(struct task_struct *p); #define KSTK_EIP(tsk) (task_pt_regs(tsk)->pc) From 713a98d90c5ea072c1bb00ef40617aee2cef2232 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 28 Jun 2017 09:51:03 +0800 Subject: [PATCH 138/149] virtio-net: serialize tx routine during reset We don't hold any tx lock when trying to disable TX during reset, which would lead to a use-after-free since ndo_start_xmit() tries to access the virtqueue which has already been freed. Fix this by using netif_tx_disable() before freeing the vqs; this makes sure no tx can happen after the vqs are freed. Reported-by: Jean-Philippe Menil Tested-by: Jean-Philippe Menil Fixes: f600b6905015 ("virtio_net: Add XDP support") Cc: John Fastabend Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Acked-by: Robert McCabe Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a871f45ecc79..143d8a95a60d 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1797,6 +1797,7 @@ static void virtnet_freeze_down(struct virtio_device *vdev) flush_work(&vi->config_work); netif_device_detach(vi->dev); + netif_tx_disable(vi->dev); cancel_delayed_work_sync(&vi->refill); if (netif_running(vi->dev)) { From c1a4872ebfb83b1af7144f7b29ac8c4b344a12a8 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 28 Jun 2017 12:53:54 +0800 Subject: [PATCH 139/149] net: sched: Fix one possible panic when no destroy callback When a qdisc fails to init, qdisc_create invokes the destroy callback to clean up. But there is no check that the callback actually exists, so the kernel panics for qdiscs that do not define a real destroy callback, such as codel, fq, and so on. Take codel as an example: when a malicious user constructs an invalid netlink msg, codel_init->codel_change->nla_parse_nested fails. The kernel then invokes the destroy callback directly, but the codel qdisc doesn't define one, which causes a panic. Add a check for the destroy callback to avoid the possible panic. Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Signed-off-by: Gao Feng Acked-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e88342fde1bc..cfdbfa18a95e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1019,7 +1019,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; } /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ - ops->destroy(sch); + if (ops->destroy) + ops->destroy(sch); err_out3: dev_put(dev); kfree((char *) sch - sch->padded); From 6b27c8adf27edf1dabe2cdcfaa101ef7e2712415 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 28 Jun 2017 09:03:12 +0300 Subject: [PATCH 140/149] mlxsw: spectrum_router: Fix NULL pointer dereference In case a VLAN device is enslaved to a bridge we shouldn't create a router interface (RIF) for it when it's configured with an IP address. This is already handled by the driver for other types of netdevs, such as physical ports and LAG devices. If this IP address is then removed and the interface is subsequently unlinked from the bridge, a NULL pointer dereference can happen, as the original 802.1d FID was replaced with an rFID which was then deleted. To reproduce: $ ip link set dev enp3s0np9 up $ ip link add name enp3s0np9.111 link enp3s0np9 type vlan id 111 $ ip link set dev enp3s0np9.111 up $ ip link add name br0 type bridge $ ip link set dev br0 up $ ip link set enp3s0np9.111 master br0 $ ip address add dev enp3s0np9.111 192.168.0.1/24 $ ip address del dev enp3s0np9.111 192.168.0.1/24 $ ip link set dev enp3s0np9.111 nomaster Fixes: 99724c18fc66 ("mlxsw: spectrum: Introduce support for router interfaces") Signed-off-by: Ido Schimmel Reported-by: Petr Machata Tested-by: Petr Machata Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 9f89c4137d21..0744452a0b18 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3334,6 +3334,9 @@ static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev, struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(vlan_dev); u16 vid = vlan_dev_vlan_id(vlan_dev); + if (netif_is_bridge_port(vlan_dev)) + return 0; + if (mlxsw_sp_port_dev_check(real_dev)) return mlxsw_sp_inetaddr_vport_event(vlan_dev, real_dev, event, vid); From acb4b7df48b539cb391287921de57e4e5fae3460 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 Jun 2017 14:44:21 +0300 Subject: [PATCH 141/149] rocker: move dereference before free My static checker complains that ofdpa_neigh_del() can sometimes free "found". It just makes sense to use it first before deleting it. Fixes: ecf244f753e0 ("rocker: fix maybe-uninitialized warning") Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 2ae852454780..a9ce82d3e9cf 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -1505,8 +1505,8 @@ static int ofdpa_port_ipv4_nh(struct ofdpa_port *ofdpa_port, *index = entry->index; resolved = false; } else if (removing) { - ofdpa_neigh_del(trans, found); *index = found->index; + ofdpa_neigh_del(trans, found); } else if (updating) { ofdpa_neigh_update(found, trans, NULL, false); resolved = !is_zero_ether_addr(found->eth_dst); From 5b85840320151f61e04d83a23ef2567a07094503 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Wed, 28 Jun 2017 18:28:33 +0200 Subject: [PATCH 142/149] arcnet: change irq handler to lock irqsave This patch prevents the following deadlock in the arcnet driver. [ 41.273910] ====================================================== [ 41.280397] [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] [ 41.287433] 4.4.0-00034-gc0ae784 #536 Not tainted [ 41.292366] ------------------------------------------------------ [ 41.298863] arcecho/233 [HC0[0]:SC0[2]:HE0:SE0] is trying to acquire: [ 41.305628] (&(&lp->lock)->rlock){+.+...}, at: [] arcnet_send_packet+0x60/0x1c0 [arcnet] [ 41.315199] [ 41.315199] and this task is already holding: [ 41.321324] (_xmit_ARCNET#2){+.-...}, at: [] packet_direct_xmit+0xfc/0x1c8 [ 41.329593] which would create a new lock dependency: [ 41.334893] (_xmit_ARCNET#2){+.-...} -> (&(&lp->lock)->rlock){+.+...} [ 41.341801] [ 41.341801] but this new dependency connects a SOFTIRQ-irq-safe lock: [ 41.350108] (_xmit_ARCNET#2){+.-...} ... which became SOFTIRQ-irq-safe at: [ 41.357539] [] _raw_spin_lock+0x30/0x40 [ 41.362677] [] dev_watchdog+0x5c/0x264 [ 41.367723] [] call_timer_fn+0x6c/0xf4 [ 41.372759] [] run_timer_softirq+0x154/0x210 [ 41.378340] [] __do_softirq+0x144/0x298 [ 41.383469] [] irq_exit+0xcc/0x130 [ 41.388138] [] __handle_domain_irq+0x60/0xb4 [ 41.393728] [] __irq_svc+0x58/0x78 [ 41.398402] [] arch_cpu_idle+0x24/0x3c [ 41.403443] [] cpu_startup_entry+0x1f8/0x25c [ 41.409029] [] start_kernel+0x3c0/0x3cc [ 41.414170] [ 41.414170] to a SOFTIRQ-irq-unsafe lock: [ 41.419931] (&(&lp->lock)->rlock){+.+...} ... which became SOFTIRQ-irq-unsafe at: [ 41.427996] ...
[] _raw_spin_lock+0x30/0x40 [ 41.433409] [] arcnet_interrupt+0x2c/0x800 [arcnet] [ 41.439646] [] handle_nested_irq+0x8c/0xec [ 41.445063] [] regmap_irq_thread+0x190/0x314 [ 41.450661] [] irq_thread_fn+0x1c/0x34 [ 41.455700] [] irq_thread+0x13c/0x1dc [ 41.460649] [] kthread+0xe4/0xf8 [ 41.465158] [] ret_from_fork+0x14/0x24 [ 41.470207] [ 41.470207] other info that might help us debug this: [ 41.470207] [ 41.478627] Possible interrupt unsafe locking scenario: [ 41.478627] [ 41.485763] CPU0 CPU1 [ 41.490521] ---- ---- [ 41.495279] lock(&(&lp->lock)->rlock); [ 41.499414] local_irq_disable(); [ 41.505636] lock(_xmit_ARCNET#2); [ 41.511967] lock(&(&lp->lock)->rlock); [ 41.518741] [ 41.521490] lock(_xmit_ARCNET#2); [ 41.525356] [ 41.525356] *** DEADLOCK *** [ 41.525356] [ 41.531587] 1 lock held by arcecho/233: [ 41.535617] #0: (_xmit_ARCNET#2){+.-...}, at: [] packet_direct_xmit+0xfc/0x1c8 [ 41.544355] the dependencies between SOFTIRQ-irq-safe lock and the holding lock: [ 41.552362] -> (_xmit_ARCNET#2){+.-...} ops: 27 { [ 41.557357] HARDIRQ-ON-W at: [ 41.560664] [] _raw_spin_lock+0x30/0x40 [ 41.567445] [] dev_deactivate_many+0x114/0x304 [ 41.574866] [] dev_deactivate+0x24/0x38 [ 41.581646] [] linkwatch_do_dev+0x40/0x74 [ 41.588613] [] __linkwatch_run_queue+0xec/0x140 [ 41.596120] [] linkwatch_event+0x2c/0x34 [ 41.602991] [] process_one_work+0x188/0x40c [ 41.610131] [] worker_thread+0x4c/0x480 [ 41.616912] [] kthread+0xe4/0xf8 [ 41.623048] [] ret_from_fork+0x14/0x24 [ 41.629735] IN-SOFTIRQ-W at: [ 41.633039] [] _raw_spin_lock+0x30/0x40 [ 41.639820] [] dev_watchdog+0x5c/0x264 [ 41.646508] [] call_timer_fn+0x6c/0xf4 [ 41.653190] [] run_timer_softirq+0x154/0x210 [ 41.660425] [] __do_softirq+0x144/0x298 [ 41.667201] [] irq_exit+0xcc/0x130 [ 41.673518] [] __handle_domain_irq+0x60/0xb4 [ 41.680754] [] __irq_svc+0x58/0x78 [ 41.687077] [] arch_cpu_idle+0x24/0x3c [ 41.693769] [] cpu_startup_entry+0x1f8/0x25c [ 41.701006] [] start_kernel+0x3c0/0x3cc [ 41.707791] INITIAL USE at: [ 41.711003] [] _raw_spin_lock+0x30/0x40 [ 41.717696] [] dev_deactivate_many+0x114/0x304 [ 41.725026] [] dev_deactivate+0x24/0x38 [ 41.731718] [] linkwatch_do_dev+0x40/0x74 [ 41.738593] [] __linkwatch_run_queue+0xec/0x140 [ 41.746011] [] linkwatch_event+0x2c/0x34 [ 41.752789] [] process_one_work+0x188/0x40c [ 41.759847] [] worker_thread+0x4c/0x480 [ 41.766541] [] kthread+0xe4/0xf8 [ 41.772596] [] ret_from_fork+0x14/0x24 [ 41.779198] } [ 41.780945] ... key at: [] netdev_xmit_lock_key+0x38/0x1c8 [ 41.788192] ... 
acquired at: [ 41.791309] [] lock_acquire+0x70/0x90 [ 41.796361] [] _raw_spin_lock_irqsave+0x40/0x54 [ 41.802324] [] arcnet_send_packet+0x60/0x1c0 [arcnet] [ 41.808844] [] packet_direct_xmit+0x130/0x1c8 [ 41.814622] [] packet_sendmsg+0x3b8/0x680 [ 41.820034] [] sock_sendmsg+0x14/0x24 [ 41.825091] [] SyS_sendto+0xb8/0xe0 [ 41.829956] [] SyS_send+0x18/0x20 [ 41.834638] [] ret_fast_syscall+0x0/0x1c [ 41.839954] [ 41.841514] the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: [ 41.850302] -> (&(&lp->lock)->rlock){+.+...} ops: 5 { [ 41.855644] HARDIRQ-ON-W at: [ 41.858945] [] _raw_spin_lock+0x30/0x40 [ 41.865726] [] arcnet_interrupt+0x2c/0x800 [arcnet] [ 41.873607] [] handle_nested_irq+0x8c/0xec [ 41.880666] [] regmap_irq_thread+0x190/0x314 [ 41.887901] [] irq_thread_fn+0x1c/0x34 [ 41.894593] [] irq_thread+0x13c/0x1dc [ 41.901195] [] kthread+0xe4/0xf8 [ 41.907338] [] ret_from_fork+0x14/0x24 [ 41.914025] SOFTIRQ-ON-W at: [ 41.917328] [] _raw_spin_lock+0x30/0x40 [ 41.924106] [] arcnet_interrupt+0x2c/0x800 [arcnet] [ 41.931981] [] handle_nested_irq+0x8c/0xec [ 41.939028] [] regmap_irq_thread+0x190/0x314 [ 41.946264] [] irq_thread_fn+0x1c/0x34 [ 41.952954] [] irq_thread+0x13c/0x1dc [ 41.959548] [] kthread+0xe4/0xf8 [ 41.965689] [] ret_from_fork+0x14/0x24 [ 41.972379] INITIAL USE at: [ 41.975595] [] _raw_spin_lock+0x30/0x40 [ 41.982283] [] arcnet_interrupt+0x2c/0x800 [arcnet] [ 41.990063] [] handle_nested_irq+0x8c/0xec [ 41.997027] [] regmap_irq_thread+0x190/0x314 [ 42.004172] [] irq_thread_fn+0x1c/0x34 [ 42.010766] [] irq_thread+0x13c/0x1dc [ 42.017267] [] kthread+0xe4/0xf8 [ 42.023314] [] ret_from_fork+0x14/0x24 [ 42.029903] } [ 42.031648] ... key at: [] __key.42091+0x0/0xfffff0f8 [arcnet] [ 42.039255] ... acquired at: [ 42.042372] [] lock_acquire+0x70/0x90 [ 42.047413] [] _raw_spin_lock_irqsave+0x40/0x54 [ 42.053364] [] arcnet_send_packet+0x60/0x1c0 [arcnet] [ 42.059872] [] packet_direct_xmit+0x130/0x1c8 [ 42.065634] [] packet_sendmsg+0x3b8/0x680 [ 42.071030] [] sock_sendmsg+0x14/0x24 [ 42.076069] [] SyS_sendto+0xb8/0xe0 [ 42.080926] [] SyS_send+0x18/0x20 [ 42.085601] [] ret_fast_syscall+0x0/0x1c [ 42.090918] [ 42.092481] [ 42.092481] stack backtrace: [ 42.097065] CPU: 0 PID: 233 Comm: arcecho Not tainted 4.4.0-00034-gc0ae784 #536 [ 42.104751] Hardware name: Generic AM33XX (Flattened Device Tree) [ 42.111183] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 42.119337] [] (show_stack) from [] (dump_stack+0x8c/0x9c) [ 42.126937] [] (dump_stack) from [] (check_usage+0x4bc/0x63c) [ 42.134815] [] (check_usage) from [] (check_irq_usage+0x58/0xb0) [ 42.142964] [] (check_irq_usage) from [] (__lock_acquire+0x1524/0x20b0) [ 42.151740] [] (__lock_acquire) from [] (lock_acquire+0x70/0x90) [ 42.159886] [] (lock_acquire) from [] (_raw_spin_lock_irqsave+0x40/0x54) [ 42.168768] [] (_raw_spin_lock_irqsave) from [] (arcnet_send_packet+0x60/0x1c0 [arcnet]) [ 42.179115] [] (arcnet_send_packet [arcnet]) from [] (packet_direct_xmit+0x130/0x1c8) [ 42.189182] [] (packet_direct_xmit) from [] (packet_sendmsg+0x3b8/0x680) [ 42.198059] [] (packet_sendmsg) from [] (sock_sendmsg+0x14/0x24) [ 42.206199] [] (sock_sendmsg) from [] (SyS_sendto+0xb8/0xe0) [ 42.213978] [] (SyS_sendto) from [] (SyS_send+0x18/0x20) [ 42.221388] [] (SyS_send) from [] (ret_fast_syscall+0x0/0x1c) Signed-off-by: Michael Grzeschik --- v1 -> v2: removed unneeded zero assignment of flags Signed-off-by: David S. 
Miller --- drivers/net/arcnet/arcnet.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index 62ee439d5882..53a1cb551def 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -756,6 +756,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) struct net_device *dev = dev_id; struct arcnet_local *lp; int recbuf, status, diagstatus, didsomething, boguscount; + unsigned long flags; int retval = IRQ_NONE; arc_printk(D_DURING, dev, "\n"); @@ -765,7 +766,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) lp = netdev_priv(dev); BUG_ON(!lp); - spin_lock(&lp->lock); + spin_lock_irqsave(&lp->lock, flags); /* RESET flag was enabled - if device is not running, we must * clear it right away (but nothing else). @@ -774,7 +775,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) if (lp->hw.status(dev) & RESETflag) lp->hw.command(dev, CFLAGScmd | RESETclear); lp->hw.intmask(dev, 0); - spin_unlock(&lp->lock); + spin_unlock_irqrestore(&lp->lock, flags); return retval; } @@ -998,7 +999,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) udelay(1); lp->hw.intmask(dev, lp->intmask); - spin_unlock(&lp->lock); + spin_unlock_irqrestore(&lp->lock, flags); return retval; } EXPORT_SYMBOL(arcnet_interrupt); From 06908d7aee8d62a80cabfd134d0354dc4d2794bc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 28 Jun 2017 18:28:34 +0200 Subject: [PATCH 143/149] Trivial fix to spelling mistake in arc_printk message Signed-off-by: Colin Ian King Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/capmode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/arcnet/capmode.c b/drivers/net/arcnet/capmode.c index 2056878fb087..4fa2e46b48d3 100644 --- a/drivers/net/arcnet/capmode.c +++ b/drivers/net/arcnet/capmode.c @@ -212,7 +212,7 @@ static int ack_tx(struct net_device *dev, int acked) ackpkt->soft.cap.proto = 0; /* using protocol 0 for acknowledge */ ackpkt->soft.cap.mes.ack = acked; - arc_printk(D_PROTO, dev, "Ackknowledge for cap packet %x.\n", + arc_printk(D_PROTO, dev, "Acknowledge for cap packet %x.\n", *((int *)&ackpkt->soft.cap.cookie[0])); ackskb->protocol = cpu_to_be16(ETH_P_ARCNET); From 0d494fcf867f95040b5b67e4bc5af739bcda37da Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Wed, 28 Jun 2017 18:28:35 +0200 Subject: [PATCH 144/149] arcnet: com20020: remove needless base_addr assignment The assignment is superfluous. Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/arcnet/com20020.c b/drivers/net/arcnet/com20020.c index 13d9ad4b3f5c..78043a9c5981 100644 --- a/drivers/net/arcnet/com20020.c +++ b/drivers/net/arcnet/com20020.c @@ -246,8 +246,6 @@ int com20020_found(struct net_device *dev, int shared) return -ENODEV; } - dev->base_addr = ioaddr; - arc_printk(D_NORMAL, dev, "%s: station %02Xh found at %03lXh, IRQ %d.\n", lp->card_name, dev->dev_addr[0], dev->base_addr, dev->irq); From cb108619f2fc77846bf7a7543517f3487f455b24 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Wed, 28 Jun 2017 18:28:36 +0200 Subject: [PATCH 145/149] arcnet: com20020-pci: fix dev_id calculation The dev_id was miscalculated. Only the two bits 4-5 are relevant for the MA1 card. PCIARC1 and PCIFB2 use the four bits 4-7 for id selection. Signed-off-by: Michael Grzeschik Signed-off-by: David S. 
Miller --- drivers/net/arcnet/com20020-pci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index 239de38fbd6a..fec2df2c869f 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -135,6 +135,7 @@ static int com20020pci_probe(struct pci_dev *pdev, for (i = 0; i < ci->devcount; i++) { struct com20020_pci_channel_map *cm = &ci->chan_map_tbl[i]; struct com20020_dev *card; + int dev_id_mask = 0xf; dev = alloc_arcdev(device); if (!dev) { @@ -179,8 +180,8 @@ static int com20020pci_probe(struct pci_dev *pdev, /* Get the dev_id from the PLX rotary coder */ if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15)) - dev->dev_id = 0xc; - dev->dev_id ^= inb(priv->misc + ci->rotary) >> 4; + dev_id_mask = 0x3; + dev->dev_id = (inb(priv->misc + ci->rotary) >> 4) & dev_id_mask; snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, i); From 2a0ea04c83ab82c3852c9171d2fa5cd9a1432c9b Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Wed, 28 Jun 2017 18:28:37 +0200 Subject: [PATCH 146/149] arcnet: com20020-pci: add missing pdev setup in netdev structure We add the pdev data to the PCI device's netdev structure. This way the interface gets consistent device names in userspace (udev). Signed-off-by: Michael Grzeschik Signed-off-by: David S. Miller --- drivers/net/arcnet/com20020-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index fec2df2c869f..47f80b83dcf4 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -167,6 +167,7 @@ static int com20020pci_probe(struct pci_dev *pdev, arcnet_outb(0x00, ioaddr, COM20020_REG_W_COMMAND); arcnet_inb(ioaddr, COM20020_REG_R_DIAGSTAT); + SET_NETDEV_DEV(dev, &pdev->dev); dev->base_addr = ioaddr; dev->dev_addr[0] = node; dev->irq = pdev->irq; From 6bdf6abc56b53103324dfd270a86580306e1a232 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Jun 2017 03:04:59 +0200 Subject: [PATCH 147/149] bpf: prevent leaking pointer via xadd on unprivileged Leaking kernel addresses to unprivileged programs is generally disallowed; for example, the verifier rejects the following: 0: (b7) r0 = 0 1: (18) r2 = 0xffff897e82304400 3: (7b) *(u64 *)(r1 +48) = r2 R2 leaks addr into ctx Doing pointer arithmetic on them is also forbidden, so that they don't turn into an unknown value and then get leaked out. However, there's xadd as a special case, where we don't check the src reg for being a pointer register, e.g. the following will pass: 0: (b7) r0 = 0 1: (7b) *(u64 *)(r1 +48) = r0 2: (18) r2 = 0xffff897e82304400 ; map 4: (db) lock *(u64 *)(r1 +48) += r2 5: (95) exit We could store the pointer into skb->cb, lose the type context, and then read it out from there again to leak it eventually out of a map value. Or more easily in a different variant, too: 0: (bf) r6 = r1 1: (7a) *(u64 *)(r10 -8) = 0 2: (bf) r2 = r10 3: (07) r2 += -8 4: (18) r1 = 0x0 6: (85) call bpf_map_lookup_elem#1 7: (15) if r0 == 0x0 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R6=ctx R10=fp 8: (b7) r3 = 0 9: (7b) *(u64 *)(r0 +0) = r3 10: (db) lock *(u64 *)(r0 +0) += r6 11: (b7) r0 = 0 12: (95) exit from 7 to 11: R0=inv,min_value=0,max_value=0 R6=ctx R10=fp 11: (b7) r0 = 0 12: (95) exit Prevent this by checking the xadd src reg for pointer types. Also add a couple of test cases related to this.
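For context only, the same xadd pattern can come out of restricted C rather than hand-written instructions. The sketch below is not part of the patch; the map definition, helper header and program/map names are assumptions following the samples/bpf conventions of the time. The atomic add compiles to a BPF_STX_XADD whose source register holds the skb (ctx) pointer, which the verifier now rejects for unprivileged loaders with a "leaks addr into mem" error, while a privileged loader is still accepted, matching the result/result_unpriv split in the new test cases.

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* samples/bpf helper stubs, assumed available */

/* Hypothetical single-slot array map used as scratch space. */
struct bpf_map_def SEC("maps") scratch_map = {
	.type        = BPF_MAP_TYPE_ARRAY,
	.key_size    = sizeof(__u32),
	.value_size  = sizeof(__u64),
	.max_entries = 1,
};

SEC("socket")
int leak_ptr(struct __sk_buff *skb)
{
	__u32 key = 0;
	__u64 *val = bpf_map_lookup_elem(&scratch_map, &key);

	if (!val)
		return 0;
	*val = 0;
	/* Emits: lock *(u64 *)(rD +0) += rS, where rS holds the ctx pointer;
	 * exactly the case the new is_pointer_value() check refuses.
	 */
	__sync_fetch_and_add(val, (unsigned long)skb);
	return 0;
}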
Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs") Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Edward Cree Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 ++ tools/testing/selftests/bpf/test_verifier.c | 66 +++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 339c8a1371de..a8a725697bed 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -989,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index cabb19b1e371..0ff8c55c0464 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -3748,6 +3748,72 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "invalid bpf_context access", }, + { + "leak pointer into ctx 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 2 }, + .errstr_unpriv = "R2 leaks addr into mem", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "leak pointer into ctx 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr_unpriv = "R10 leaks addr into mem", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "leak pointer into ctx 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 1 }, + .errstr_unpriv = "R2 leaks addr into ctx", + .result_unpriv = REJECT, + .result = ACCEPT, + }, + { + "leak pointer into map val", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), + BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr_unpriv = "R6 leaks addr into mem", + .result_unpriv = REJECT, + .result = ACCEPT, + }, { "helper access to map: full range", .insns = { From e44699d2c28067f69698ccb68dd3ddeacfebc434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Thu, 29 Jun 2017 11:13:36 +0200 Subject: [PATCH 148/149] net: handle NAPI_GRO_FREE_STOLEN_HEAD case also in napi_frags_finish() Recently I started seeing warnings about pages with refcount -1. The problem was traced to packets being reused after their head was merged into a GRO packet by skb_gro_receive(). 
While bisecting the issue pointed to commit c21b48cc1bbf ("net: adjust skb->truesize in ___pskb_trim()"), and I have never seen it on a kernel with that commit reverted, I believe the real problem appeared earlier, when the option to merge the head frag in GRO was implemented. Handling of the NAPI_GRO_FREE_STOLEN_HEAD state was only added to the GRO_MERGED_FREE branch of napi_skb_finish(), so if the driver uses napi_gro_frags() and the head is merged (which in my case happens after the skb_condense() call added by the commit mentioned above), the skb is reused including the head that has been merged. As a result, we release the page reference twice and eventually end up with a negative page refcount. To fix the problem, handle NAPI_GRO_FREE_STOLEN_HEAD in napi_frags_finish() the same way it's done in napi_skb_finish(). Fixes: d7e8883cfcf4 ("net: make GRO aware of skb->head_frag") Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/core/dev.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 91bb55070533..416137c64bf8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4767,6 +4767,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +static void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); + kmem_cache_free(skbuff_head_cache, skb); +} + static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { @@ -4780,13 +4787,10 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { - skb_dst_drop(skb); - secpath_reset(skb); - kmem_cache_free(skbuff_head_cache, skb); - } else { + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else __kfree_skb(skb); - } break; case GRO_HELD: @@ -4858,10 +4862,16 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, break; case GRO_DROP: - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + case GRO_MERGED: case GRO_CONSUMED: break; From d58299a478c416c0b48e4b31c6332fe7beb63000 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 29 Jun 2017 16:50:06 +0100 Subject: [PATCH 149/149] sfc: fix attempt to translate invalid filter ID When filter insertion fails with no rollback, we were trying to convert EFX_EF10_FILTER_ID_INVALID to an id to store in 'ids' (which is either vlan->uc or vlan->mc). This would WARN_ON_ONCE and then record a bogus filter ID of 0x1fff, neither of which is a good thing. Fixes: 0ccb998bf46d ("sfc: fix filter_id misinterpretation in edge case") Signed-off-by: Edward Cree Signed-off-by: David S.
Miller --- drivers/net/ethernet/sfc/ef10.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index a8089667fc5b..78f9e43420e0 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -5105,6 +5105,7 @@ static int efx_ef10_filter_insert_addr_list(struct efx_nic *efx, /* Insert/renew filters */ for (i = 0; i < addr_count; i++) { + EFX_WARN_ON_PARANOID(ids[i] != EFX_EF10_FILTER_ID_INVALID); efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0); efx_filter_set_eth_local(&spec, vlan->vid, addr_list[i].addr); rc = efx_ef10_filter_insert(efx, &spec, true); @@ -5122,11 +5123,11 @@ static int efx_ef10_filter_insert_addr_list(struct efx_nic *efx, } return rc; } else { - /* mark as not inserted, and carry on */ - rc = EFX_EF10_FILTER_ID_INVALID; + /* keep invalid ID, and carry on */ } + } else { + ids[i] = efx_ef10_filter_get_unsafe_id(rc); } - ids[i] = efx_ef10_filter_get_unsafe_id(rc); } if (multicast && rollback) {