Merge drm/drm-next into drm-misc-next
Backmerging 5.2-rc1 to -misc-next for robher Signed-off-by: Sean Paul <seanpaul@chromium.org>
This commit is contained in:
@@ -387,14 +387,14 @@ ForEachMacros:
|
|||||||
- 'rhl_for_each_entry_rcu'
|
- 'rhl_for_each_entry_rcu'
|
||||||
- 'rhl_for_each_rcu'
|
- 'rhl_for_each_rcu'
|
||||||
- 'rht_for_each'
|
- 'rht_for_each'
|
||||||
- 'rht_for_each_continue'
|
- 'rht_for_each_from'
|
||||||
- 'rht_for_each_entry'
|
- 'rht_for_each_entry'
|
||||||
- 'rht_for_each_entry_continue'
|
- 'rht_for_each_entry_from'
|
||||||
- 'rht_for_each_entry_rcu'
|
- 'rht_for_each_entry_rcu'
|
||||||
- 'rht_for_each_entry_rcu_continue'
|
- 'rht_for_each_entry_rcu_from'
|
||||||
- 'rht_for_each_entry_safe'
|
- 'rht_for_each_entry_safe'
|
||||||
- 'rht_for_each_rcu'
|
- 'rht_for_each_rcu'
|
||||||
- 'rht_for_each_rcu_continue'
|
- 'rht_for_each_rcu_from'
|
||||||
- '__rq_for_each_bio'
|
- '__rq_for_each_bio'
|
||||||
- 'rq_for_each_bvec'
|
- 'rq_for_each_bvec'
|
||||||
- 'rq_for_each_segment'
|
- 'rq_for_each_segment'
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
Christoph Hellwig <hch@lst.de>
|
Christoph Hellwig <hch@lst.de>
|
||||||
|
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||||
|
|||||||
24
.gitignore
vendored
24
.gitignore
vendored
@@ -58,6 +58,7 @@ modules.builtin
|
|||||||
/vmlinuz
|
/vmlinuz
|
||||||
/System.map
|
/System.map
|
||||||
/Module.markers
|
/Module.markers
|
||||||
|
/modules.builtin.modinfo
|
||||||
|
|
||||||
#
|
#
|
||||||
# RPM spec file (make rpm-pkg)
|
# RPM spec file (make rpm-pkg)
|
||||||
@@ -80,20 +81,22 @@ modules.builtin
|
|||||||
/tar-install/
|
/tar-install/
|
||||||
|
|
||||||
#
|
#
|
||||||
# git files that we don't want to ignore even if they are dot-files
|
# We don't want to ignore the following even if they are dot-files
|
||||||
#
|
#
|
||||||
|
!.clang-format
|
||||||
|
!.cocciconfig
|
||||||
|
!.get_maintainer.ignore
|
||||||
|
!.gitattributes
|
||||||
!.gitignore
|
!.gitignore
|
||||||
!.mailmap
|
!.mailmap
|
||||||
!.cocciconfig
|
|
||||||
!.clang-format
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Generated include files
|
# Generated include files
|
||||||
#
|
#
|
||||||
include/config
|
/include/config/
|
||||||
include/generated
|
/include/generated/
|
||||||
include/ksym
|
/include/ksym/
|
||||||
arch/*/include/generated
|
/arch/*/include/generated/
|
||||||
|
|
||||||
# stgit generated dirs
|
# stgit generated dirs
|
||||||
patches-*
|
patches-*
|
||||||
@@ -129,7 +132,12 @@ signing_key.x509
|
|||||||
x509.genkey
|
x509.genkey
|
||||||
|
|
||||||
# Kconfig presets
|
# Kconfig presets
|
||||||
all.config
|
/all.config
|
||||||
|
/alldef.config
|
||||||
|
/allmod.config
|
||||||
|
/allno.config
|
||||||
|
/allrandom.config
|
||||||
|
/allyes.config
|
||||||
|
|
||||||
# Kdevelop4
|
# Kdevelop4
|
||||||
*.kdev4
|
*.kdev4
|
||||||
|
|||||||
16
.mailmap
16
.mailmap
@@ -16,6 +16,11 @@ Alan Cox <alan@lxorguk.ukuu.org.uk>
|
|||||||
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||||
Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
Aleksey Gorelov <aleksey_gorelov@phoenix.com>
|
||||||
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
Aleksandar Markovic <aleksandar.markovic@mips.com> <aleksandar.markovic@imgtec.com>
|
||||||
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@intel.com>
|
||||||
|
Alex Shi <alex.shi@linux.alibaba.com> <alex.shi@linaro.org>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <ast@plumgrid.com>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <alexei.starovoitov@gmail.com>
|
||||||
|
Alexei Starovoitov <ast@kernel.org> <ast@fb.com>
|
||||||
Al Viro <viro@ftp.linux.org.uk>
|
Al Viro <viro@ftp.linux.org.uk>
|
||||||
Al Viro <viro@zenIV.linux.org.uk>
|
Al Viro <viro@zenIV.linux.org.uk>
|
||||||
Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
Andi Shyti <andi@etezian.org> <andi.shyti@samsung.com>
|
||||||
@@ -46,6 +51,12 @@ Christoph Hellwig <hch@lst.de>
|
|||||||
Christophe Ricard <christophe.ricard@gmail.com>
|
Christophe Ricard <christophe.ricard@gmail.com>
|
||||||
Corey Minyard <minyard@acm.org>
|
Corey Minyard <minyard@acm.org>
|
||||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dborkman@redhat.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dborkmann@redhat.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@iogearbox.net>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <daniel.borkmann@tik.ee.ethz.ch>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||||
|
Daniel Borkmann <daniel@iogearbox.net> <dxchgb@gmail.com>
|
||||||
David Brownell <david-b@pacbell.net>
|
David Brownell <david-b@pacbell.net>
|
||||||
David Woodhouse <dwmw2@shinybook.infradead.org>
|
David Woodhouse <dwmw2@shinybook.infradead.org>
|
||||||
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
Dengcheng Zhu <dzhu@wavecomp.com> <dengcheng.zhu@mips.com>
|
||||||
@@ -117,6 +128,8 @@ Leonid I Ananiev <leonid.i.ananiev@intel.com>
|
|||||||
Linas Vepstas <linas@austin.ibm.com>
|
Linas Vepstas <linas@austin.ibm.com>
|
||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||||
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@ascom.ch>
|
||||||
|
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||||
|
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||||
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
|
Marcin Nowakowski <marcin.nowakowski@mips.com> <marcin.nowakowski@imgtec.com>
|
||||||
Mark Brown <broonie@sirena.org.uk>
|
Mark Brown <broonie@sirena.org.uk>
|
||||||
@@ -189,6 +202,7 @@ Santosh Shilimkar <ssantosh@kernel.org>
|
|||||||
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
Santosh Shilimkar <santosh.shilimkar@oracle.org>
|
||||||
Sascha Hauer <s.hauer@pengutronix.de>
|
Sascha Hauer <s.hauer@pengutronix.de>
|
||||||
S.Çağlar Onur <caglar@pardus.org.tr>
|
S.Çağlar Onur <caglar@pardus.org.tr>
|
||||||
|
Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
|
||||||
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
|
||||||
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
|
||||||
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
|
||||||
@@ -207,6 +221,8 @@ Tejun Heo <htejun@gmail.com>
|
|||||||
Thomas Graf <tgraf@suug.ch>
|
Thomas Graf <tgraf@suug.ch>
|
||||||
Thomas Pedersen <twp@codeaurora.org>
|
Thomas Pedersen <twp@codeaurora.org>
|
||||||
Tony Luck <tony.luck@intel.com>
|
Tony Luck <tony.luck@intel.com>
|
||||||
|
TripleX Chung <xxx.phy@gmail.com> <zhongyu@18mail.cn>
|
||||||
|
TripleX Chung <xxx.phy@gmail.com> <triplex@zh-kernel.org>
|
||||||
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
|
Tsuneo Yoshioka <Tsuneo.Yoshioka@f-secure.com>
|
||||||
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
|
Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
|
||||||
Uwe Kleine-König <ukl@pengutronix.de>
|
Uwe Kleine-König <ukl@pengutronix.de>
|
||||||
|
|||||||
32
Documentation/ABI/obsolete/sysfs-class-net-batman-adv
Normal file
32
Documentation/ABI/obsolete/sysfs-class-net-batman-adv
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
This ABI is deprecated and will be removed after 2021. It is
|
||||||
|
replaced with the batadv generic netlink family.
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/batman-adv/elp_interval
|
||||||
|
Date: Feb 2014
|
||||||
|
Contact: Linus Lüssing <linus.luessing@web.de>
|
||||||
|
Description:
|
||||||
|
Defines the interval in milliseconds in which batman
|
||||||
|
emits probing packets for neighbor sensing (ELP).
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/batman-adv/iface_status
|
||||||
|
Date: May 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Indicates the status of <iface> as it is seen by batman.
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/batman-adv/mesh_iface
|
||||||
|
Date: May 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
The /sys/class/net/<iface>/batman-adv/mesh_iface file
|
||||||
|
displays the batman mesh interface this <iface>
|
||||||
|
currently is associated with.
|
||||||
|
|
||||||
|
What: /sys/class/net/<iface>/batman-adv/throughput_override
|
||||||
|
Date: Feb 2014
|
||||||
|
Contact: Antonio Quartulli <a@unstable.cc>
|
||||||
|
Description:
|
||||||
|
Defines the throughput value to be used by B.A.T.M.A.N. V
|
||||||
|
when estimating the link throughput using this interface.
|
||||||
|
If the value is set to 0 then batman-adv will try to
|
||||||
|
estimate the throughput by itself.
|
||||||
110
Documentation/ABI/obsolete/sysfs-class-net-mesh
Normal file
110
Documentation/ABI/obsolete/sysfs-class-net-mesh
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
This ABI is deprecated and will be removed after 2021. It is
|
||||||
|
replaced with the batadv generic netlink family.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/aggregated_ogms
|
||||||
|
Date: May 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Indicates whether the batman protocol messages of the
|
||||||
|
mesh <mesh_iface> shall be aggregated or not.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/<vlan_subdir>/ap_isolation
|
||||||
|
Date: May 2011
|
||||||
|
Contact: Antonio Quartulli <a@unstable.cc>
|
||||||
|
Description:
|
||||||
|
Indicates whether the data traffic going from a
|
||||||
|
wireless client to another wireless client will be
|
||||||
|
silently dropped. <vlan_subdir> is empty when referring
|
||||||
|
to the untagged lan.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/bonding
|
||||||
|
Date: June 2010
|
||||||
|
Contact: Simon Wunderlich <sw@simonwunderlich.de>
|
||||||
|
Description:
|
||||||
|
Indicates whether the data traffic going through the
|
||||||
|
mesh will be sent using multiple interfaces at the
|
||||||
|
same time (if available).
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/bridge_loop_avoidance
|
||||||
|
Date: November 2011
|
||||||
|
Contact: Simon Wunderlich <sw@simonwunderlich.de>
|
||||||
|
Description:
|
||||||
|
Indicates whether the bridge loop avoidance feature
|
||||||
|
is enabled. This feature detects and avoids loops
|
||||||
|
between the mesh and devices bridged with the soft
|
||||||
|
interface <mesh_iface>.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/fragmentation
|
||||||
|
Date: October 2010
|
||||||
|
Contact: Andreas Langer <an.langer@gmx.de>
|
||||||
|
Description:
|
||||||
|
Indicates whether the data traffic going through the
|
||||||
|
mesh will be fragmented or silently discarded if the
|
||||||
|
packet size exceeds the outgoing interface MTU.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/gw_bandwidth
|
||||||
|
Date: October 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Defines the bandwidth which is propagated by this
|
||||||
|
node if gw_mode was set to 'server'.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/gw_mode
|
||||||
|
Date: October 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Defines the state of the gateway features. Can be
|
||||||
|
either 'off', 'client' or 'server'.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/gw_sel_class
|
||||||
|
Date: October 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Defines the selection criteria this node will use
|
||||||
|
to choose a gateway if gw_mode was set to 'client'.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/hop_penalty
|
||||||
|
Date: Oct 2010
|
||||||
|
Contact: Linus Lüssing <linus.luessing@web.de>
|
||||||
|
Description:
|
||||||
|
Defines the penalty which will be applied to an
|
||||||
|
originator message's tq-field on every hop.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/isolation_mark
|
||||||
|
Date: Nov 2013
|
||||||
|
Contact: Antonio Quartulli <a@unstable.cc>
|
||||||
|
Description:
|
||||||
|
Defines the isolation mark (and its bitmask) which
|
||||||
|
is used to classify clients as "isolated" by the
|
||||||
|
Extended Isolation feature.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/multicast_mode
|
||||||
|
Date: Feb 2014
|
||||||
|
Contact: Linus Lüssing <linus.luessing@web.de>
|
||||||
|
Description:
|
||||||
|
Indicates whether multicast optimizations are enabled
|
||||||
|
or disabled. If set to zero then all nodes in the
|
||||||
|
mesh are going to use classic flooding for any
|
||||||
|
multicast packet with no optimizations.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/network_coding
|
||||||
|
Date: Nov 2012
|
||||||
|
Contact: Martin Hundeboll <martin@hundeboll.net>
|
||||||
|
Description:
|
||||||
|
Controls whether Network Coding (using some magic
|
||||||
|
to send fewer wifi packets but still the same
|
||||||
|
content) is enabled or not.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/orig_interval
|
||||||
|
Date: May 2010
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Defines the interval in milliseconds in which batman
|
||||||
|
sends its protocol messages.
|
||||||
|
|
||||||
|
What: /sys/class/net/<mesh_iface>/mesh/routing_algo
|
||||||
|
Date: Dec 2011
|
||||||
|
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
||||||
|
Description:
|
||||||
|
Defines the routing protocol this mesh instance
|
||||||
|
uses to find the optimal paths through the mesh.
|
||||||
@@ -6,6 +6,8 @@ Description:
|
|||||||
This file allows user to read/write the raw NVMEM contents.
|
This file allows user to read/write the raw NVMEM contents.
|
||||||
Permissions for write to this file depends on the nvmem
|
Permissions for write to this file depends on the nvmem
|
||||||
provider configuration.
|
provider configuration.
|
||||||
|
Note: This file is only present if CONFIG_NVMEM_SYSFS
|
||||||
|
is enabled
|
||||||
|
|
||||||
ex:
|
ex:
|
||||||
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
|
||||||
|
|||||||
@@ -81,7 +81,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/latency
|
|||||||
Date: September. 2017
|
Date: September. 2017
|
||||||
KernelVersion: 4.14
|
KernelVersion: 4.14
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Channel signaling latency
|
Description: Channel signaling latency. This file is available only for
|
||||||
|
performance critical channels (storage, network, etc.) that use
|
||||||
|
the monitor page mechanism.
|
||||||
Users: Debugging tools
|
Users: Debugging tools
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
|
||||||
@@ -95,7 +97,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/pending
|
|||||||
Date: September. 2017
|
Date: September. 2017
|
||||||
KernelVersion: 4.14
|
KernelVersion: 4.14
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Channel interrupt pending state
|
Description: Channel interrupt pending state. This file is available only for
|
||||||
|
performance critical channels (storage, network, etc.) that use
|
||||||
|
the monitor page mechanism.
|
||||||
Users: Debugging tools
|
Users: Debugging tools
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
|
||||||
@@ -137,7 +141,9 @@ What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/monitor_id
|
|||||||
Date: January. 2018
|
Date: January. 2018
|
||||||
KernelVersion: 4.16
|
KernelVersion: 4.16
|
||||||
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
Contact: Stephen Hemminger <sthemmin@microsoft.com>
|
||||||
Description: Monitor bit associated with channel
|
Description: Monitor bit associated with channel. This file is available only
|
||||||
|
for performance critical channels (storage, network, etc.) that
|
||||||
|
use the monitor page mechanism.
|
||||||
Users: Debugging tools and userspace drivers
|
Users: Debugging tools and userspace drivers
|
||||||
|
|
||||||
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
|
What: /sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
|
||||||
|
|||||||
@@ -90,4 +90,89 @@ Date: December 2009
|
|||||||
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
|
Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
|
||||||
Description:
|
Description:
|
||||||
The node's huge page size control/query attributes.
|
The node's huge page size control/query attributes.
|
||||||
See Documentation/admin-guide/mm/hugetlbpage.rst
|
See Documentation/admin-guide/mm/hugetlbpage.rst
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The node's relationship to other nodes for access class "Y".
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing symlinks to memory initiator
|
||||||
|
nodes that have class "Y" access to this target node's
|
||||||
|
memory. CPUs and other memory initiators in nodes not in
|
||||||
|
the list accessing this node's memory may have different
|
||||||
|
performance.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/targets/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing symlinks to memory targets that
|
||||||
|
this initiator node has class "Y" access.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/read_bandwidth
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's read bandwidth in MB/s when accessed from
|
||||||
|
nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/read_latency
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's read latency in nanoseconds when accessed
|
||||||
|
from nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/write_bandwidth
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's write bandwidth in MB/s when accessed from
|
||||||
|
nodes found in this access class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/accessY/initiators/write_latency
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
This node's write latency in nanoseconds when accessed
|
||||||
|
from nodes found in this class's linked initiators.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The directory containing attributes for the memory-side cache
|
||||||
|
level 'Y'.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/indexing
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The cache's associativity indexing: 0 for direct mapped,
|
||||||
|
non-zero if indexed.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/line_size
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The number of bytes accessed from the next cache level on a
|
||||||
|
cache miss.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/size
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The size of this memory side cache in bytes.
|
||||||
|
|
||||||
|
What: /sys/devices/system/node/nodeX/memory_side_cache/indexY/write_policy
|
||||||
|
Date: December 2018
|
||||||
|
Contact: Keith Busch <keith.busch@intel.com>
|
||||||
|
Description:
|
||||||
|
The cache write policy: 0 for write-back, 1 for write-through,
|
||||||
|
other or unknown.
|
||||||
|
|||||||
@@ -1,23 +1,46 @@
|
|||||||
|
What: /sys/kernel/debug/wilco_ec/h1_gpio
|
||||||
|
Date: April 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Description:
|
||||||
|
As part of Chrome OS's FAFT (Fully Automated Firmware Testing)
|
||||||
|
tests, we need to ensure that the H1 chip is properly setting
|
||||||
|
some GPIO lines. The h1_gpio attribute exposes the state
|
||||||
|
of the lines:
|
||||||
|
- ENTRY_TO_FACT_MODE in BIT(0)
|
||||||
|
- SPI_CHROME_SEL in BIT(1)
|
||||||
|
|
||||||
|
Output will be formatted with "0x%02x\n".
|
||||||
|
|
||||||
What: /sys/kernel/debug/wilco_ec/raw
|
What: /sys/kernel/debug/wilco_ec/raw
|
||||||
Date: January 2019
|
Date: January 2019
|
||||||
KernelVersion: 5.1
|
KernelVersion: 5.1
|
||||||
Description:
|
Description:
|
||||||
Write and read raw mailbox commands to the EC.
|
Write and read raw mailbox commands to the EC.
|
||||||
|
|
||||||
For writing:
|
You can write a hexadecimal sentence to raw, and that series of
|
||||||
Bytes 0-1 indicate the message type:
|
bytes will be sent to the EC. Then, you can read the bytes of
|
||||||
00 F0 = Execute Legacy Command
|
response by reading from raw.
|
||||||
00 F2 = Read/Write NVRAM Property
|
|
||||||
Byte 2 provides the command code
|
|
||||||
Bytes 3+ consist of the data passed in the request
|
|
||||||
|
|
||||||
At least three bytes are required, for the msg type and command,
|
For writing, bytes 0-1 indicate the message type, one of enum
|
||||||
with additional bytes optional for additional data.
|
wilco_ec_msg_type. Byte 2+ consist of the data passed in the
|
||||||
|
request, starting at MBOX[0]
|
||||||
|
|
||||||
|
At least three bytes are required for writing, two for the type
|
||||||
|
and at least a single byte of data. Only the first
|
||||||
|
EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
// Request EC info type 3 (EC firmware build date)
|
// Request EC info type 3 (EC firmware build date)
|
||||||
$ echo 00 f0 38 00 03 00 > raw
|
// Corresponds with sending type 0x00f0 with
|
||||||
|
// MBOX = [38, 00, 03, 00]
|
||||||
|
$ echo 00 f0 38 00 03 00 > /sys/kernel/debug/wilco_ec/raw
|
||||||
// View the result. The decoded ASCII result "12/21/18" is
|
// View the result. The decoded ASCII result "12/21/18" is
|
||||||
// included after the raw hex.
|
// included after the raw hex.
|
||||||
$ cat raw
|
// Corresponds with MBOX = [00, 00, 31, 32, 2f, 32, 31, 38, ...]
|
||||||
00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 .12/21/18.8...
|
$ cat /sys/kernel/debug/wilco_ec/raw
|
||||||
|
00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 ..12/21/18.8...
|
||||||
|
|
||||||
|
Note that the first 32 bytes of the received MBOX[] will be
|
||||||
|
printed, even if some of the data is junk. It is up to you to
|
||||||
|
know how many of the first bytes of data are the actual
|
||||||
|
response.
|
||||||
|
|||||||
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
230
Documentation/ABI/testing/sysfs-bus-counter
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/countY/count
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count data of Count Y represented as a string.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count value ceiling for Count Y. This is the upper limit for the
|
||||||
|
respective counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/floor
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count value floor for Count Y. This is the lower limit for the
|
||||||
|
respective counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/count_mode
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count mode for channel Y. The ceiling and floor values for
|
||||||
|
Count Y are used by the count mode where required. The following
|
||||||
|
count modes are available:
|
||||||
|
|
||||||
|
normal:
|
||||||
|
Counting is continuous in either direction.
|
||||||
|
|
||||||
|
range limit:
|
||||||
|
An upper or lower limit is set, mimicking limit switches
|
||||||
|
in the mechanical counterpart. The upper limit is set to
|
||||||
|
the Count Y ceiling value, while the lower limit is set
|
||||||
|
to the Count Y floor value. The counter freezes at
|
||||||
|
count = ceiling when counting up, and at count = floor
|
||||||
|
when counting down. At either of these limits, the
|
||||||
|
counting is resumed only when the count direction is
|
||||||
|
reversed.
|
||||||
|
|
||||||
|
non-recycle:
|
||||||
|
The counter is disabled whenever a counter overflow or
|
||||||
|
underflow takes place. The counter is re-enabled when a
|
||||||
|
new count value is loaded to the counter via a preset
|
||||||
|
operation or direct write.
|
||||||
|
|
||||||
|
modulo-n:
|
||||||
|
A count value boundary is set between the Count Y floor
|
||||||
|
value and the Count Y ceiling value. The counter is
|
||||||
|
reset to the Count Y floor value at count = ceiling when
|
||||||
|
counting up, while the counter is set to the Count Y
|
||||||
|
ceiling value at count = floor when counting down; the
|
||||||
|
counter does not freeze at the boundary points, but
|
||||||
|
counts continuously throughout.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/count_mode_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/error_noise_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/function_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Count Y
|
||||||
|
configuration are listed in this file. Values are delimited by
|
||||||
|
newline characters.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/direction
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the count direction of Count
|
||||||
|
Y. Two count directions are available: forward and backward.
|
||||||
|
|
||||||
|
Some counter devices are able to determine the direction of
|
||||||
|
their counting. For example, quadrature encoding counters can
|
||||||
|
determine the direction of movement by evaluating the leading
|
||||||
|
phase of the respective A and B quadrature encoding signals.
|
||||||
|
This attribute exposes such count directions.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/enable
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Whether channel Y counter is enabled. Valid attribute values are
|
||||||
|
boolean.
|
||||||
|
|
||||||
|
This attribute is intended to serve as a pause/unpause mechanism
|
||||||
|
for Count Y. Suppose a counter device is used to count the total
|
||||||
|
movement of a conveyor belt: this attribute allows an operator
|
||||||
|
to temporarily pause the counter, service the conveyor belt,
|
||||||
|
and then finally unpause the counter to continue where it had
|
||||||
|
left off.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/error_noise
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates whether excessive noise is
|
||||||
|
present at the channel Y counter inputs.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/function
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Count function mode of Count Y; count function evaluation is
|
||||||
|
triggered by conditions specified by the Count Y signalZ_action
|
||||||
|
attributes. The following count functions are available:
|
||||||
|
|
||||||
|
increase:
|
||||||
|
Accumulated count is incremented.
|
||||||
|
|
||||||
|
decrease:
|
||||||
|
Accumulated count is decremented.
|
||||||
|
|
||||||
|
pulse-direction:
|
||||||
|
Rising edges on signal A update the respective count.
|
||||||
|
The input level of signal B determines direction.
|
||||||
|
|
||||||
|
quadrature x1 a:
|
||||||
|
If direction is forward, rising edges on quadrature pair
|
||||||
|
signal A updates the respective count; if the direction
|
||||||
|
is backward, falling edges on quadrature pair signal A
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
quadrature x1 b:
|
||||||
|
If direction is forward, rising edges on quadrature pair
|
||||||
|
signal B updates the respective count; if the direction
|
||||||
|
is backward, falling edges on quadrature pair signal B
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
quadrature x2 a:
|
||||||
|
Any state transition on quadrature pair signal A updates
|
||||||
|
the respective count. Quadrature encoding determines the
|
||||||
|
direction.
|
||||||
|
|
||||||
|
quadrature x2 b:
|
||||||
|
Any state transition on quadrature pair signal B updates
|
||||||
|
the respective count. Quadrature encoding determines the
|
||||||
|
direction.
|
||||||
|
|
||||||
|
quadrature x4:
|
||||||
|
Any state transition on either quadrature pair signals
|
||||||
|
updates the respective count. Quadrature encoding
|
||||||
|
determines the direction.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
Count Y. If possible, this should match the name of the
|
||||||
|
respective channel as it appears in the device datasheet.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/preset
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
If the counter device supports preset registers -- registers
|
||||||
|
used to load counter channels to a set count upon device-defined
|
||||||
|
preset operation trigger events -- the preset count for channel
|
||||||
|
Y is provided by this attribute.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/preset_enable
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Whether channel Y counter preset operation is enabled. Valid
|
||||||
|
attribute values are boolean.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/signalZ_action
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Action mode of Count Y for Signal Z. This attribute indicates
|
||||||
|
the condition of Signal Z that triggers the count function
|
||||||
|
evaluation for Count Y. The following action modes are
|
||||||
|
available:
|
||||||
|
|
||||||
|
none:
|
||||||
|
Signal does not trigger the count function. In
|
||||||
|
Pulse-Direction count function mode, this Signal is
|
||||||
|
evaluated as Direction.
|
||||||
|
|
||||||
|
rising edge:
|
||||||
|
Low state transitions to high state.
|
||||||
|
|
||||||
|
falling edge:
|
||||||
|
High state transitions to low state.
|
||||||
|
|
||||||
|
both edges:
|
||||||
|
Any state transition.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
the Counter. This should match the name of the device as it
|
||||||
|
appears in its respective datasheet.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/num_counts
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the total number of Counts
|
||||||
|
belonging to the Counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/num_signals
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the total number of Signals
|
||||||
|
belonging to the Counter.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/signal
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Signal data of Signal Y represented as a string.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/name
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the device-specific name of
|
||||||
|
Signal Y. If possible, this should match the name of the
|
||||||
|
respective signal as it appears in the device datasheet.
|
||||||
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
36
Documentation/ABI/testing/sysfs-bus-counter-104-quad-8
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Active level of index input Signal Y; irrelevant in
|
||||||
|
non-synchronous load mode.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_available
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Signal Y
|
||||||
|
configuration are listed in this file.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Configure the counter associated with Signal Y for
|
||||||
|
non-synchronous or synchronous load mode. Synchronous load mode
|
||||||
|
cannot be selected in non-quadrature (Pulse-Direction) clock
|
||||||
|
mode.
|
||||||
|
|
||||||
|
non-synchronous:
|
||||||
|
A logic low level is the active level at this index
|
||||||
|
input. The index function (as enabled via preset_enable)
|
||||||
|
is performed directly on the active level of the index
|
||||||
|
input.
|
||||||
|
|
||||||
|
synchronous:
|
||||||
|
Intended for interfacing with encoder Index output in
|
||||||
|
quadrature clock mode. The active level is configured
|
||||||
|
via index_polarity. The index function (as enabled via
|
||||||
|
preset_enable) is performed synchronously with the
|
||||||
|
quadrature clock on the active level of the index input.
|
||||||
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
16
Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
What: /sys/bus/counter/devices/counterX/countY/prescaler_available
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Discrete set of available values for the respective Count Y
|
||||||
|
configuration are listed in this file. Values are delimited by
|
||||||
|
newline characters.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/prescaler
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Configure the prescaler value associated with Count Y.
|
||||||
|
On the FlexTimer, the counter clock source passes through a
|
||||||
|
prescaler (i.e. a counter). This acts like a clock
|
||||||
|
divider.
|
||||||
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
20
Documentation/ABI/testing/sysfs-bus-i2c-devices-pca954x
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
What: /sys/bus/i2c/.../idle_state
|
||||||
|
Date: January 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: Robert Shearman <robert.shearman@att.com>
|
||||||
|
Description:
|
||||||
|
Value that exists only for mux devices that can be
|
||||||
|
written to control the behaviour of the multiplexer on
|
||||||
|
idle. Possible values:
|
||||||
|
-2 - disconnect on idle, i.e. deselect the last used
|
||||||
|
channel, which is useful when there is a device
|
||||||
|
with an address that conflicts with another
|
||||||
|
device on another mux on the same parent bus.
|
||||||
|
-1 - leave the mux as-is, which is the optimal
|
||||||
|
setting in terms of I2C operations and is the
|
||||||
|
default mode.
|
||||||
|
0..<nchans> - set the mux to a predetermined channel,
|
||||||
|
which is useful if there is one channel that is
|
||||||
|
used almost always, and you want to reduce the
|
||||||
|
latency for normal operations after rare
|
||||||
|
transactions on other channels
|
||||||
@@ -1656,6 +1656,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device counts from channel Y. For quadrature
|
Raw counter device counts from channel Y. For quadrature
|
||||||
counters, multiplication by an available [Y]_scale results in
|
counters, multiplication by an available [Y]_scale results in
|
||||||
the counts of a single quadrature signal phase from channel Y.
|
the counts of a single quadrature signal phase from channel Y.
|
||||||
@@ -1664,6 +1666,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device index value from channel Y. This attribute
|
Raw counter device index value from channel Y. This attribute
|
||||||
provides an absolute positional reference (e.g. a pulse once per
|
provides an absolute positional reference (e.g. a pulse once per
|
||||||
revolution) which may be used to home positional systems as
|
revolution) which may be used to home positional systems as
|
||||||
@@ -1673,6 +1677,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available
|
|||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
A list of possible counting directions which are:
|
A list of possible counting directions which are:
|
||||||
- "up" : counter device is increasing.
|
- "up" : counter device is increasing.
|
||||||
- "down": counter device is decreasing.
|
- "down": counter device is decreasing.
|
||||||
@@ -1681,6 +1687,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction
|
|||||||
KernelVersion: 4.12
|
KernelVersion: 4.12
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Raw counter device counters direction for channel Y.
|
Raw counter device counters direction for channel Y.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_phaseY_raw
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Discrete set of available values for the respective counter
|
Discrete set of available values for the respective counter
|
||||||
configuration are listed in this file.
|
configuration are listed in this file.
|
||||||
|
|
||||||
@@ -13,6 +15,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Count mode for channel Y. Four count modes are available:
|
Count mode for channel Y. Four count modes are available:
|
||||||
normal, range limit, non-recycle, and modulo-n. The preset value
|
normal, range limit, non-recycle, and modulo-n. The preset value
|
||||||
for channel Y is used by the count mode where required.
|
for channel Y is used by the count mode where required.
|
||||||
@@ -47,6 +51,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Read-only attribute that indicates whether excessive noise is
|
Read-only attribute that indicates whether excessive noise is
|
||||||
present at the channel Y count inputs in quadrature clock mode;
|
present at the channel Y count inputs in quadrature clock mode;
|
||||||
irrelevant in non-quadrature clock mode.
|
irrelevant in non-quadrature clock mode.
|
||||||
@@ -55,6 +61,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
If the counter device supports preset registers, the preset
|
If the counter device supports preset registers, the preset
|
||||||
count for channel Y is provided by this attribute.
|
count for channel Y is provided by this attribute.
|
||||||
|
|
||||||
@@ -62,6 +70,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Configure channel Y counter for non-quadrature or quadrature
|
Configure channel Y counter for non-quadrature or quadrature
|
||||||
clock mode. Selecting non-quadrature clock mode will disable
|
clock mode. Selecting non-quadrature clock mode will disable
|
||||||
synchronous load mode. In quadrature clock mode, the channel Y
|
synchronous load mode. In quadrature clock mode, the channel Y
|
||||||
@@ -83,6 +93,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Whether to set channel Y counter with channel Y preset value
|
Whether to set channel Y counter with channel Y preset value
|
||||||
when channel Y index input is active, or continuously count.
|
when channel Y index input is active, or continuously count.
|
||||||
Valid attribute values are boolean.
|
Valid attribute values are boolean.
|
||||||
@@ -91,6 +103,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Active level of channel Y index input; irrelevant in
|
Active level of channel Y index input; irrelevant in
|
||||||
non-synchronous load mode.
|
non-synchronous load mode.
|
||||||
|
|
||||||
@@ -98,6 +112,8 @@ What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode
|
|||||||
KernelVersion: 4.10
|
KernelVersion: 4.10
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
|
This interface is deprecated; please use the Counter subsystem.
|
||||||
|
|
||||||
Configure channel Y counter for non-synchronous or synchronous
|
Configure channel Y counter for non-synchronous or synchronous
|
||||||
load mode. Synchronous load mode cannot be selected in
|
load mode. Synchronous load mode cannot be selected in
|
||||||
non-quadrature clock mode.
|
non-quadrature clock mode.
|
||||||
|
|||||||
@@ -0,0 +1,35 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_start
|
||||||
|
Date: March 2019
|
||||||
|
KernelVersion: 3.1.0
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Frequency sweep start frequency in Hz.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_increment
|
||||||
|
Date: March 2019
|
||||||
|
KernelVersion: 3.1.0
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Frequency increment in Hz (step size) between consecutive
|
||||||
|
frequency points along the sweep.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency_points
|
||||||
|
Date: March 2019
|
||||||
|
KernelVersion: 3.1.0
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Number of frequency points (steps) in the frequency sweep.
|
||||||
|
This value, in conjunction with the
|
||||||
|
out_altvoltageY_frequency_start and the
|
||||||
|
out_altvoltageY_frequency_increment, determines the frequency
|
||||||
|
sweep range for the sweep operation.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_settling_cycles
|
||||||
|
Date: March 2019
|
||||||
|
KernelVersion: 3.1.0
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Number of output excitation cycles (settling time cycles)
|
||||||
|
that are allowed to pass through the unknown impedance,
|
||||||
|
after each frequency increment, and before the ADC is triggered
|
||||||
|
to perform a conversion sequence of the response signal.
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
What: /sys/bus/iio/devices/iio:deviceX/start_cleaning
|
||||||
Date: December 2018
|
Date: December 2018
|
||||||
KernelVersion: 4.22
|
KernelVersion: 5.0
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
Writing 1 starts sensor self cleaning. Internal fan accelerates
|
||||||
|
|||||||
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
24
Documentation/ABI/testing/sysfs-bus-iio-temperature-max31856
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/fault_oc
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Open-circuit fault. Indicates the detection of open-circuit faults,
|
||||||
|
such as those caused by broken thermocouple wires.
|
||||||
|
Reading returns either '1' or '0'.
|
||||||
|
'1' = An open circuit such as broken thermocouple wires
|
||||||
|
has been detected.
|
||||||
|
'0' = No open circuit or broken thermocouple wires are detected
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/fault_ovuv
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Overvoltage or Undervoltage Input Fault. The internal circuitry
|
||||||
|
is protected from excessive voltages applied to the thermocouple
|
||||||
|
cables by integrated MOSFETs at the T+ and T- inputs, and the
|
||||||
|
BIAS output. These MOSFETs turn off when the input voltage is
|
||||||
|
negative or greater than VDD.
|
||||||
|
Reading returns either '1' or '0'.
|
||||||
|
'1' = The input voltage is negative or greater than VDD.
|
||||||
|
'0' = The input voltage is positive and less than VDD (normal
|
||||||
|
state).
|
||||||
@@ -30,4 +30,12 @@ Description: (RW) Configure MSC buffer size for "single" or "multi" modes.
|
|||||||
there are no active users and tracing is not enabled) and then
|
there are no active users and tracing is not enabled) and then
|
||||||
allocates a new one.
|
allocates a new one.
|
||||||
|
|
||||||
|
What: /sys/bus/intel_th/devices/<intel_th_id>-msc<msc-id>/win_switch
|
||||||
|
Date: May 2019
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: Alexander Shishkin <alexander.shishkin@linux.intel.com>
|
||||||
|
Description: (RW) Trigger window switch for the MSC's buffer, in
|
||||||
|
multi-window mode. In "multi" mode, accepts writes of "1", thereby
|
||||||
|
triggering a window switch for the buffer. Returns an error in any
|
||||||
|
other operating mode or on attempts to write something other than "1".
|
||||||
|
|
||||||
|
|||||||
@@ -65,3 +65,18 @@ Description: Display the ME firmware version.
|
|||||||
<platform>:<major>.<minor>.<milestone>.<build_no>.
|
<platform>:<major>.<minor>.<milestone>.<build_no>.
|
||||||
There can be up to three such blocks for different
|
There can be up to three such blocks for different
|
||||||
FW components.
|
FW components.
|
||||||
|
|
||||||
|
What: /sys/class/mei/meiN/dev_state
|
||||||
|
Date: Mar 2019
|
||||||
|
KernelVersion: 5.1
|
||||||
|
Contact: Tomas Winkler <tomas.winkler@intel.com>
|
||||||
|
Description: Display the ME device state.
|
||||||
|
|
||||||
|
The device state can have the following values:
|
||||||
|
INITIALIZING
|
||||||
|
INIT_CLIENTS
|
||||||
|
ENABLED
|
||||||
|
RESETTING
|
||||||
|
DISABLED
|
||||||
|
POWER_DOWN
|
||||||
|
POWER_UP
|
||||||
|
|||||||
@@ -1,30 +0,0 @@
|
|||||||
|
|
||||||
What: /sys/class/net/<iface>/batman-adv/elp_interval
|
|
||||||
Date: Feb 2014
|
|
||||||
Contact: Linus Lüssing <linus.luessing@web.de>
|
|
||||||
Description:
|
|
||||||
Defines the interval in milliseconds in which batman
|
|
||||||
emits probing packets for neighbor sensing (ELP).
|
|
||||||
|
|
||||||
What: /sys/class/net/<iface>/batman-adv/iface_status
|
|
||||||
Date: May 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Indicates the status of <iface> as it is seen by batman.
|
|
||||||
|
|
||||||
What: /sys/class/net/<iface>/batman-adv/mesh_iface
|
|
||||||
Date: May 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
The /sys/class/net/<iface>/batman-adv/mesh_iface file
|
|
||||||
displays the batman mesh interface this <iface>
|
|
||||||
currently is associated with.
|
|
||||||
|
|
||||||
What: /sys/class/net/<iface>/batman-adv/throughput_override
|
|
||||||
Date: Feb 2014
|
|
||||||
Contact: Antonio Quartulli <a@unstable.cc>
|
|
||||||
Description:
|
|
||||||
Defines the throughput value to be used by B.A.T.M.A.N. V
|
|
||||||
when estimating the link throughput using this interface.
|
|
||||||
If the value is set to 0 then batman-adv will try to
|
|
||||||
estimate the throughput by itself.
|
|
||||||
@@ -1,108 +0,0 @@
|
|||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/aggregated_ogms
|
|
||||||
Date: May 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Indicates whether the batman protocol messages of the
|
|
||||||
mesh <mesh_iface> shall be aggregated or not.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/<vlan_subdir>/ap_isolation
|
|
||||||
Date: May 2011
|
|
||||||
Contact: Antonio Quartulli <a@unstable.cc>
|
|
||||||
Description:
|
|
||||||
Indicates whether the data traffic going from a
|
|
||||||
wireless client to another wireless client will be
|
|
||||||
silently dropped. <vlan_subdir> is empty when referring
|
|
||||||
to the untagged lan.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/bonding
|
|
||||||
Date: June 2010
|
|
||||||
Contact: Simon Wunderlich <sw@simonwunderlich.de>
|
|
||||||
Description:
|
|
||||||
Indicates whether the data traffic going through the
|
|
||||||
mesh will be sent using multiple interfaces at the
|
|
||||||
same time (if available).
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/bridge_loop_avoidance
|
|
||||||
Date: November 2011
|
|
||||||
Contact: Simon Wunderlich <sw@simonwunderlich.de>
|
|
||||||
Description:
|
|
||||||
Indicates whether the bridge loop avoidance feature
|
|
||||||
is enabled. This feature detects and avoids loops
|
|
||||||
between the mesh and devices bridged with the soft
|
|
||||||
interface <mesh_iface>.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/fragmentation
|
|
||||||
Date: October 2010
|
|
||||||
Contact: Andreas Langer <an.langer@gmx.de>
|
|
||||||
Description:
|
|
||||||
Indicates whether the data traffic going through the
|
|
||||||
mesh will be fragmented or silently discarded if the
|
|
||||||
packet size exceeds the outgoing interface MTU.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/gw_bandwidth
|
|
||||||
Date: October 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Defines the bandwidth which is propagated by this
|
|
||||||
node if gw_mode was set to 'server'.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/gw_mode
|
|
||||||
Date: October 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Defines the state of the gateway features. Can be
|
|
||||||
either 'off', 'client' or 'server'.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/gw_sel_class
|
|
||||||
Date: October 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Defines the selection criteria this node will use
|
|
||||||
to choose a gateway if gw_mode was set to 'client'.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/hop_penalty
|
|
||||||
Date: Oct 2010
|
|
||||||
Contact: Linus Lüssing <linus.luessing@web.de>
|
|
||||||
Description:
|
|
||||||
Defines the penalty which will be applied to an
|
|
||||||
originator message's tq-field on every hop.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/isolation_mark
|
|
||||||
Date: Nov 2013
|
|
||||||
Contact: Antonio Quartulli <a@unstable.cc>
|
|
||||||
Description:
|
|
||||||
Defines the isolation mark (and its bitmask) which
|
|
||||||
is used to classify clients as "isolated" by the
|
|
||||||
Extended Isolation feature.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/multicast_mode
|
|
||||||
Date: Feb 2014
|
|
||||||
Contact: Linus Lüssing <linus.luessing@web.de>
|
|
||||||
Description:
|
|
||||||
Indicates whether multicast optimizations are enabled
|
|
||||||
or disabled. If set to zero then all nodes in the
|
|
||||||
mesh are going to use classic flooding for any
|
|
||||||
multicast packet with no optimizations.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/network_coding
|
|
||||||
Date: Nov 2012
|
|
||||||
Contact: Martin Hundeboll <martin@hundeboll.net>
|
|
||||||
Description:
|
|
||||||
Controls whether Network Coding (using some magic
|
|
||||||
to send fewer wifi packets but still the same
|
|
||||||
content) is enabled or not.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/orig_interval
|
|
||||||
Date: May 2010
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Defines the interval in milliseconds in which batman
|
|
||||||
sends its protocol messages.
|
|
||||||
|
|
||||||
What: /sys/class/net/<mesh_iface>/mesh/routing_algo
|
|
||||||
Date: Dec 2011
|
|
||||||
Contact: Marek Lindner <mareklindner@neomailbox.ch>
|
|
||||||
Description:
|
|
||||||
Defines the routing protocol this mesh instance
|
|
||||||
uses to find the optimal paths through the mesh.
|
|
||||||
@@ -114,15 +114,60 @@ Description:
|
|||||||
Access: Read
|
Access: Read
|
||||||
Valid values: Represented in microamps
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_limit
|
||||||
|
Date: Oct 2012
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Maximum allowable charging current. Used for charge rate
|
||||||
|
throttling for thermal cooling or improving battery health.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_limit_max
|
||||||
|
Date: Oct 2012
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Maximum legal value for the charge_control_limit property.
|
||||||
|
|
||||||
|
Access: Read
|
||||||
|
Valid values: Represented in microamps
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_start_threshold
|
||||||
|
Date: April 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Represents a battery percentage level, below which charging will
|
||||||
|
begin.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: 0 - 100 (percent)
|
||||||
|
|
||||||
|
What: /sys/class/power_supply/<supply_name>/charge_control_end_threshold
|
||||||
|
Date: April 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Represents a battery percentage level, above which charging will
|
||||||
|
stop.
|
||||||
|
|
||||||
|
Access: Read, Write
|
||||||
|
Valid values: 0 - 100 (percent)
|
||||||
|
|
||||||
What: /sys/class/power_supply/<supply_name>/charge_type
|
What: /sys/class/power_supply/<supply_name>/charge_type
|
||||||
Date: July 2009
|
Date: July 2009
|
||||||
Contact: linux-pm@vger.kernel.org
|
Contact: linux-pm@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Represents the type of charging currently being applied to the
|
Represents the type of charging currently being applied to the
|
||||||
battery.
|
battery. "Trickle", "Fast", and "Standard" all mean different
|
||||||
|
charging speeds. "Adaptive" means that the charger uses some
|
||||||
|
algorithm to adjust the charge rate dynamically, without
|
||||||
|
any user configuration required. "Custom" means that the charger
|
||||||
|
uses the charge_control_* properties as configuration for some
|
||||||
|
different algorithm.
|
||||||
|
|
||||||
Access: Read
|
Access: Read, Write
|
||||||
Valid values: "Unknown", "N/A", "Trickle", "Fast"
|
Valid values: "Unknown", "N/A", "Trickle", "Fast", "Standard",
|
||||||
|
"Adaptive", "Custom"
|
||||||
|
|
||||||
What: /sys/class/power_supply/<supply_name>/charge_term_current
|
What: /sys/class/power_supply/<supply_name>/charge_term_current
|
||||||
Date: July 2014
|
Date: July 2014
|
||||||
|
|||||||
@@ -212,7 +212,7 @@ Description:
|
|||||||
Messages may be broken into parts if
|
Messages may be broken into parts if
|
||||||
they are long.
|
they are long.
|
||||||
|
|
||||||
receieved_messages: (RO) Number of message responses
|
received_messages: (RO) Number of message responses
|
||||||
received.
|
received.
|
||||||
|
|
||||||
received_message_parts: (RO) Number of message fragments
|
received_message_parts: (RO) Number of message fragments
|
||||||
|
|||||||
@@ -484,6 +484,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
|||||||
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
||||||
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
|
||||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/mds
|
||||||
Date: January 2018
|
Date: January 2018
|
||||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||||
Description: Information about CPU vulnerabilities
|
Description: Information about CPU vulnerabilities
|
||||||
@@ -496,8 +497,7 @@ Description: Information about CPU vulnerabilities
|
|||||||
"Vulnerable" CPU is affected and no mitigation in effect
|
"Vulnerable" CPU is affected and no mitigation in effect
|
||||||
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
||||||
|
|
||||||
Details about the l1tf file can be found in
|
See also: Documentation/admin-guide/hw-vuln/index.rst
|
||||||
Documentation/admin-guide/l1tf.rst
|
|
||||||
|
|
||||||
What: /sys/devices/system/cpu/smt
|
What: /sys/devices/system/cpu/smt
|
||||||
/sys/devices/system/cpu/smt/active
|
/sys/devices/system/cpu/smt/active
|
||||||
@@ -511,10 +511,30 @@ Description: Control Symetric Multi Threading (SMT)
|
|||||||
control: Read/write interface to control SMT. Possible
|
control: Read/write interface to control SMT. Possible
|
||||||
values:
|
values:
|
||||||
|
|
||||||
"on" SMT is enabled
|
"on" SMT is enabled
|
||||||
"off" SMT is disabled
|
"off" SMT is disabled
|
||||||
"forceoff" SMT is force disabled. Cannot be changed.
|
"forceoff" SMT is force disabled. Cannot be changed.
|
||||||
"notsupported" SMT is not supported by the CPU
|
"notsupported" SMT is not supported by the CPU
|
||||||
|
"notimplemented" SMT runtime toggling is not
|
||||||
|
implemented for the architecture
|
||||||
|
|
||||||
If control status is "forceoff" or "notsupported" writes
|
If control status is "forceoff" or "notsupported" writes
|
||||||
are rejected.
|
are rejected.
|
||||||
|
|
||||||
|
What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias
|
||||||
|
Date: March 2019
|
||||||
|
Contact: linux-pm@vger.kernel.org
|
||||||
|
Description: Intel Energy and Performance Bias Hint (EPB)
|
||||||
|
|
||||||
|
EPB for the given CPU on a sliding scale 0 - 15, where a value
|
||||||
|
of 0 corresponds to a hint preference for highest performance
|
||||||
|
and a value of 15 corresponds to the maximum energy savings.
|
||||||
|
|
||||||
|
In order to change the EPB value for the CPU, write either
|
||||||
|
a number in the 0 - 15 sliding scale above, or one of the
|
||||||
|
strings: "performance", "balance-performance", "normal",
|
||||||
|
"balance-power", "power" (that represent values reflected by
|
||||||
|
their meaning), to this attribute.
|
||||||
|
|
||||||
|
This attribute is present for all online CPUs supporting the
|
||||||
|
Intel EPB feature.
|
||||||
|
|||||||
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
6
Documentation/ABI/testing/sysfs-driver-ucsi-ccg
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
What: /sys/bus/i2c/drivers/ucsi_ccg/.../do_flash
|
||||||
|
Date: May 2019
|
||||||
|
Contact: Ajay Gupta <ajayg@nvidia.com>
|
||||||
|
Description:
|
||||||
|
Tell the driver for Cypress CCGx Type-C controller to attempt
|
||||||
|
firmware upgrade by writing [Yy1] to the file.
|
||||||
@@ -45,7 +45,7 @@ Description:
|
|||||||
use this feature without a clearance from a patch
|
use this feature without a clearance from a patch
|
||||||
distributor. Removal (rmmod) of patch modules is permanently
|
distributor. Removal (rmmod) of patch modules is permanently
|
||||||
disabled when the feature is used. See
|
disabled when the feature is used. See
|
||||||
Documentation/livepatch/livepatch.txt for more information.
|
Documentation/livepatch/livepatch.rst for more information.
|
||||||
|
|
||||||
What: /sys/kernel/livepatch/<patch>/<object>
|
What: /sys/kernel/livepatch/<patch>/<object>
|
||||||
Date: Nov 2014
|
Date: Nov 2014
|
||||||
|
|||||||
27
Documentation/ABI/testing/usb-uevent
Normal file
27
Documentation/ABI/testing/usb-uevent
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
What: Raise a uevent when a USB Host Controller has died
|
||||||
|
Date: 2019-04-17
|
||||||
|
KernelVersion: 5.2
|
||||||
|
Contact: linux-usb@vger.kernel.org
|
||||||
|
Description: When the USB Host Controller has entered a state where it is no
|
||||||
|
longer functional, a uevent will be raised. The uevent will
|
||||||
|
contain ACTION=offline and ERROR=DEAD.
|
||||||
|
|
||||||
|
Here is an example taken using udevadm monitor -p:
|
||||||
|
|
||||||
|
KERNEL[130.428945] offline /devices/pci0000:00/0000:00:10.0/usb2 (usb)
|
||||||
|
ACTION=offline
|
||||||
|
BUSNUM=002
|
||||||
|
DEVNAME=/dev/bus/usb/002/001
|
||||||
|
DEVNUM=001
|
||||||
|
DEVPATH=/devices/pci0000:00/0000:00:10.0/usb2
|
||||||
|
DEVTYPE=usb_device
|
||||||
|
DRIVER=usb
|
||||||
|
ERROR=DEAD
|
||||||
|
MAJOR=189
|
||||||
|
MINOR=128
|
||||||
|
PRODUCT=1d6b/2/414
|
||||||
|
SEQNUM=2168
|
||||||
|
SUBSYSTEM=usb
|
||||||
|
TYPE=9/0/1
|
||||||
|
|
||||||
|
Users: chromium-os-dev@chromium.org
|
||||||
@@ -147,7 +147,7 @@ networking subsystems make sure that the buffers they use are valid
|
|||||||
for you to DMA from/to.
|
for you to DMA from/to.
|
||||||
|
|
||||||
DMA addressing capabilities
|
DMA addressing capabilities
|
||||||
==========================
|
===========================
|
||||||
|
|
||||||
By default, the kernel assumes that your device can address 32-bits of DMA
|
By default, the kernel assumes that your device can address 32-bits of DMA
|
||||||
addressing. For a 64-bit capable device, this needs to be increased, and for
|
addressing. For a 64-bit capable device, this needs to be increased, and for
|
||||||
@@ -365,13 +365,12 @@ __get_free_pages() (but takes size instead of a page order). If your
|
|||||||
driver needs regions sized smaller than a page, you may prefer using
|
driver needs regions sized smaller than a page, you may prefer using
|
||||||
the dma_pool interface, described below.
|
the dma_pool interface, described below.
|
||||||
|
|
||||||
The consistent DMA mapping interfaces, for non-NULL dev, will by
|
The consistent DMA mapping interfaces will by default return a DMA address
|
||||||
default return a DMA address which is 32-bit addressable. Even if the
|
which is 32-bit addressable. Even if the device indicates (via the DMA mask)
|
||||||
device indicates (via DMA mask) that it may address the upper 32-bits,
|
that it may address the upper 32-bits, consistent allocation will only
|
||||||
consistent allocation will only return > 32-bit addresses for DMA if
|
return > 32-bit addresses for DMA if the consistent DMA mask has been
|
||||||
the consistent DMA mask has been explicitly changed via
|
explicitly changed via dma_set_coherent_mask(). This is true of the
|
||||||
dma_set_coherent_mask(). This is true of the dma_pool interface as
|
dma_pool interface as well.
|
||||||
well.
|
|
||||||
|
|
||||||
dma_alloc_coherent() returns two values: the virtual address which you
|
dma_alloc_coherent() returns two values: the virtual address which you
|
||||||
can use to access it from the CPU and dma_handle which you pass to the
|
can use to access it from the CPU and dma_handle which you pass to the
|
||||||
|
|||||||
@@ -28,8 +28,13 @@ ifeq ($(HAVE_SPHINX),0)
|
|||||||
|
|
||||||
else # HAVE_SPHINX
|
else # HAVE_SPHINX
|
||||||
|
|
||||||
# User-friendly check for pdflatex
|
# User-friendly check for pdflatex and latexmk
|
||||||
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
|
HAVE_LATEXMK := $(shell if which latexmk >/dev/null 2>&1; then echo 1; else echo 0; fi)
|
||||||
|
|
||||||
|
ifeq ($(HAVE_LATEXMK),1)
|
||||||
|
PDFLATEX := latexmk -$(PDFLATEX)
|
||||||
|
endif #HAVE_LATEXMK
|
||||||
|
|
||||||
# Internal variables.
|
# Internal variables.
|
||||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||||
@@ -82,7 +87,7 @@ pdfdocs:
|
|||||||
else # HAVE_PDFLATEX
|
else # HAVE_PDFLATEX
|
||||||
|
|
||||||
pdfdocs: latexdocs
|
pdfdocs: latexdocs
|
||||||
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX="$(PDFLATEX)" LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
|
||||||
|
|
||||||
endif # HAVE_PDFLATEX
|
endif # HAVE_PDFLATEX
|
||||||
|
|
||||||
|
|||||||
@@ -155,8 +155,7 @@ keeping lock contention under control at all tree levels regardless
|
|||||||
of the level of loading on the system.
|
of the level of loading on the system.
|
||||||
|
|
||||||
</p><p>RCU updaters wait for normal grace periods by registering
|
</p><p>RCU updaters wait for normal grace periods by registering
|
||||||
RCU callbacks, either directly via <tt>call_rcu()</tt> and
|
RCU callbacks, either directly via <tt>call_rcu()</tt>
|
||||||
friends (namely <tt>call_rcu_bh()</tt> and <tt>call_rcu_sched()</tt>),
|
|
||||||
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
or indirectly via <tt>synchronize_rcu()</tt> and friends.
|
||||||
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
RCU callbacks are represented by <tt>rcu_head</tt> structures,
|
||||||
which are queued on <tt>rcu_data</tt> structures while they are
|
which are queued on <tt>rcu_data</tt> structures while they are
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ sections.
|
|||||||
RCU-preempt Expedited Grace Periods</a></h2>
|
RCU-preempt Expedited Grace Periods</a></h2>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
<tt>CONFIG_PREEMPT=y</tt> kernels implement RCU-preempt.
|
||||||
The overall flow of the handling of a given CPU by an RCU-preempt
|
The overall flow of the handling of a given CPU by an RCU-preempt
|
||||||
expedited grace period is shown in the following diagram:
|
expedited grace period is shown in the following diagram:
|
||||||
|
|
||||||
@@ -139,6 +140,7 @@ or offline, among other things.
|
|||||||
RCU-sched Expedited Grace Periods</a></h2>
|
RCU-sched Expedited Grace Periods</a></h2>
|
||||||
|
|
||||||
<p>
|
<p>
|
||||||
|
<tt>CONFIG_PREEMPT=n</tt> kernels implement RCU-sched.
|
||||||
The overall flow of the handling of a given CPU by an RCU-sched
|
The overall flow of the handling of a given CPU by an RCU-sched
|
||||||
expedited grace period is shown in the following diagram:
|
expedited grace period is shown in the following diagram:
|
||||||
|
|
||||||
@@ -146,7 +148,7 @@ expedited grace period is shown in the following diagram:
|
|||||||
|
|
||||||
<p>
|
<p>
|
||||||
As with RCU-preempt, RCU-sched's
|
As with RCU-preempt, RCU-sched's
|
||||||
<tt>synchronize_sched_expedited()</tt> ignores offline and
|
<tt>synchronize_rcu_expedited()</tt> ignores offline and
|
||||||
idle CPUs, again because they are in remotely detectable
|
idle CPUs, again because they are in remotely detectable
|
||||||
quiescent states.
|
quiescent states.
|
||||||
However, because the
|
However, because the
|
||||||
|
|||||||
@@ -34,12 +34,11 @@ Similarly, any code that happens before the beginning of a given RCU grace
|
|||||||
period is guaranteed to see the effects of all accesses following the end
|
period is guaranteed to see the effects of all accesses following the end
|
||||||
of that grace period that are within RCU read-side critical sections.
|
of that grace period that are within RCU read-side critical sections.
|
||||||
|
|
||||||
<p>This guarantee is particularly pervasive for <tt>synchronize_sched()</tt>,
|
<p>Note well that RCU-sched read-side critical sections include any region
|
||||||
for which RCU-sched read-side critical sections include any region
|
|
||||||
of code for which preemption is disabled.
|
of code for which preemption is disabled.
|
||||||
Given that each individual machine instruction can be thought of as
|
Given that each individual machine instruction can be thought of as
|
||||||
an extremely small region of preemption-disabled code, one can think of
|
an extremely small region of preemption-disabled code, one can think of
|
||||||
<tt>synchronize_sched()</tt> as <tt>smp_mb()</tt> on steroids.
|
<tt>synchronize_rcu()</tt> as <tt>smp_mb()</tt> on steroids.
|
||||||
|
|
||||||
<p>RCU updaters use this guarantee by splitting their updates into
|
<p>RCU updaters use this guarantee by splitting their updates into
|
||||||
two phases, one of which is executed before the grace period and
|
two phases, one of which is executed before the grace period and
|
||||||
|
|||||||
@@ -81,18 +81,19 @@ currently executing on some other CPU. We therefore cannot free
|
|||||||
up any data structures used by the old NMI handler until execution
|
up any data structures used by the old NMI handler until execution
|
||||||
of it completes on all other CPUs.
|
of it completes on all other CPUs.
|
||||||
|
|
||||||
One way to accomplish this is via synchronize_sched(), perhaps as
|
One way to accomplish this is via synchronize_rcu(), perhaps as
|
||||||
follows:
|
follows:
|
||||||
|
|
||||||
unset_nmi_callback();
|
unset_nmi_callback();
|
||||||
synchronize_sched();
|
synchronize_rcu();
|
||||||
kfree(my_nmi_data);
|
kfree(my_nmi_data);
|
||||||
|
|
||||||
This works because synchronize_sched() blocks until all CPUs complete
|
This works because (as of v4.20) synchronize_rcu() blocks until all
|
||||||
any preemption-disabled segments of code that they were executing.
|
CPUs complete any preemption-disabled segments of code that they were
|
||||||
Since NMI handlers disable preemption, synchronize_sched() is guaranteed
|
executing.
|
||||||
|
Since NMI handlers disable preemption, synchronize_rcu() is guaranteed
|
||||||
not to return until all ongoing NMI handlers exit. It is therefore safe
|
not to return until all ongoing NMI handlers exit. It is therefore safe
|
||||||
to free up the handler's data as soon as synchronize_sched() returns.
|
to free up the handler's data as soon as synchronize_rcu() returns.
|
||||||
|
|
||||||
Important note: for this to work, the architecture in question must
|
Important note: for this to work, the architecture in question must
|
||||||
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively.
|
||||||
|
|||||||
@@ -86,10 +86,8 @@ even on a UP system. So do not do it! Even on a UP system, the RCU
|
|||||||
infrastructure -must- respect grace periods, and -must- invoke callbacks
|
infrastructure -must- respect grace periods, and -must- invoke callbacks
|
||||||
from a known environment in which no locks are held.
|
from a known environment in which no locks are held.
|
||||||
|
|
||||||
It -is- safe for synchronize_sched() and synchronize_rcu_bh() to return
|
Note that it -is- safe for synchronize_rcu() to return immediately on
|
||||||
immediately on an UP system. It is also safe for synchronize_rcu()
|
UP systems, including !PREEMPT SMP builds running on UP systems.
|
||||||
to return immediately on UP systems, except when running preemptable
|
|
||||||
RCU.
|
|
||||||
|
|
||||||
Quick Quiz #3: Why can't synchronize_rcu() return immediately on
|
Quick Quiz #3: Why can't synchronize_rcu() return immediately on
|
||||||
UP systems running preemptable RCU?
|
UP systems running preemptable RCU?
|
||||||
|
|||||||
@@ -182,16 +182,13 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
when publicizing a pointer to a structure that can
|
when publicizing a pointer to a structure that can
|
||||||
be traversed by an RCU read-side critical section.
|
be traversed by an RCU read-side critical section.
|
||||||
|
|
||||||
5. If call_rcu(), or a related primitive such as call_rcu_bh(),
|
5. If call_rcu() or call_srcu() is used, the callback function will
|
||||||
call_rcu_sched(), or call_srcu() is used, the callback function
|
be called from softirq context. In particular, it cannot block.
|
||||||
will be called from softirq context. In particular, it cannot
|
|
||||||
block.
|
|
||||||
|
|
||||||
6. Since synchronize_rcu() can block, it cannot be called from
|
6. Since synchronize_rcu() can block, it cannot be called
|
||||||
any sort of irq context. The same rule applies for
|
from any sort of irq context. The same rule applies
|
||||||
synchronize_rcu_bh(), synchronize_sched(), synchronize_srcu(),
|
for synchronize_srcu(), synchronize_rcu_expedited(), and
|
||||||
synchronize_rcu_expedited(), synchronize_rcu_bh_expedited(),
|
synchronize_srcu_expedited().
|
||||||
synchronize_sched_expedite(), and synchronize_srcu_expedited().
|
|
||||||
|
|
||||||
The expedited forms of these primitives have the same semantics
|
The expedited forms of these primitives have the same semantics
|
||||||
as the non-expedited forms, but expediting is both expensive and
|
as the non-expedited forms, but expediting is both expensive and
|
||||||
@@ -212,20 +209,20 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
of the system, especially to real-time workloads running on
|
of the system, especially to real-time workloads running on
|
||||||
the rest of the system.
|
the rest of the system.
|
||||||
|
|
||||||
7. If the updater uses call_rcu() or synchronize_rcu(), then the
|
7. As of v4.20, a given kernel implements only one RCU flavor,
|
||||||
corresponding readers must use rcu_read_lock() and
|
which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y.
|
||||||
rcu_read_unlock(). If the updater uses call_rcu_bh() or
|
If the updater uses call_rcu() or synchronize_rcu(),
|
||||||
synchronize_rcu_bh(), then the corresponding readers must
|
then the corresponding readers may use rcu_read_lock() and
|
||||||
use rcu_read_lock_bh() and rcu_read_unlock_bh(). If the
|
rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(),
|
||||||
updater uses call_rcu_sched() or synchronize_sched(), then
|
or any pair of primitives that disables and re-enables preemption,
|
||||||
the corresponding readers must disable preemption, possibly
|
for example, rcu_read_lock_sched() and rcu_read_unlock_sched().
|
||||||
by calling rcu_read_lock_sched() and rcu_read_unlock_sched().
|
If the updater uses synchronize_srcu() or call_srcu(),
|
||||||
If the updater uses synchronize_srcu() or call_srcu(), then
|
then the corresponding readers must use srcu_read_lock() and
|
||||||
the corresponding readers must use srcu_read_lock() and
|
|
||||||
srcu_read_unlock(), and with the same srcu_struct. The rules for
|
srcu_read_unlock(), and with the same srcu_struct. The rules for
|
||||||
the expedited primitives are the same as for their non-expedited
|
the expedited primitives are the same as for their non-expedited
|
||||||
counterparts. Mixing things up will result in confusion and
|
counterparts. Mixing things up will result in confusion and
|
||||||
broken kernels.
|
broken kernels, and has even resulted in an exploitable security
|
||||||
|
issue.
|
||||||
|
|
||||||
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
One exception to this rule: rcu_read_lock() and rcu_read_unlock()
|
||||||
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
may be substituted for rcu_read_lock_bh() and rcu_read_unlock_bh()
|
||||||
@@ -288,8 +285,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||||
number of updates per grace period.
|
number of updates per grace period.
|
||||||
|
|
||||||
The same cautions apply to call_rcu_bh(), call_rcu_sched(),
|
The same cautions apply to call_srcu() and kfree_rcu().
|
||||||
call_srcu(), and kfree_rcu().
|
|
||||||
|
|
||||||
Note that although these primitives do take action to avoid memory
|
Note that although these primitives do take action to avoid memory
|
||||||
exhaustion when any given CPU has too many callbacks, a determined
|
exhaustion when any given CPU has too many callbacks, a determined
|
||||||
@@ -322,7 +318,7 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
|
|
||||||
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
11. Any lock acquired by an RCU callback must be acquired elsewhere
|
||||||
with softirq disabled, e.g., via spin_lock_irqsave(),
|
with softirq disabled, e.g., via spin_lock_irqsave(),
|
||||||
spin_lock_bh(), etc. Failing to disable irq on a given
|
spin_lock_bh(), etc. Failing to disable softirq on a given
|
||||||
acquisition of that lock will result in deadlock as soon as
|
acquisition of that lock will result in deadlock as soon as
|
||||||
the RCU softirq handler happens to run your RCU callback while
|
the RCU softirq handler happens to run your RCU callback while
|
||||||
interrupting that acquisition's critical section.
|
interrupting that acquisition's critical section.
|
||||||
@@ -335,13 +331,16 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
must use whatever locking or other synchronization is required
|
must use whatever locking or other synchronization is required
|
||||||
to safely access and/or modify that data structure.
|
to safely access and/or modify that data structure.
|
||||||
|
|
||||||
RCU callbacks are -usually- executed on the same CPU that executed
|
Do not assume that RCU callbacks will be executed on the same
|
||||||
the corresponding call_rcu(), call_rcu_bh(), or call_rcu_sched(),
|
CPU that executed the corresponding call_rcu() or call_srcu().
|
||||||
but are by -no- means guaranteed to be. For example, if a given
|
For example, if a given CPU goes offline while having an RCU
|
||||||
CPU goes offline while having an RCU callback pending, then that
|
callback pending, then that RCU callback will execute on some
|
||||||
RCU callback will execute on some surviving CPU. (If this was
|
surviving CPU. (If this was not the case, a self-spawning RCU
|
||||||
not the case, a self-spawning RCU callback would prevent the
|
callback would prevent the victim CPU from ever going offline.)
|
||||||
victim CPU from ever going offline.)
|
Furthermore, CPUs designated by rcu_nocbs= might well -always-
|
||||||
|
have their RCU callbacks executed on some other CPUs. In fact,
|
||||||
|
for some real-time workloads, this is the whole point of using
|
||||||
|
the rcu_nocbs= kernel boot parameter.
|
||||||
|
|
||||||
13. Unlike other forms of RCU, it -is- permissible to block in an
|
13. Unlike other forms of RCU, it -is- permissible to block in an
|
||||||
SRCU read-side critical section (demarked by srcu_read_lock()
|
SRCU read-side critical section (demarked by srcu_read_lock()
|
||||||
@@ -381,11 +380,11 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
|
|
||||||
SRCU's expedited primitive (synchronize_srcu_expedited())
|
SRCU's expedited primitive (synchronize_srcu_expedited())
|
||||||
never sends IPIs to other CPUs, so it is easier on
|
never sends IPIs to other CPUs, so it is easier on
|
||||||
real-time workloads than is synchronize_rcu_expedited(),
|
real-time workloads than is synchronize_rcu_expedited().
|
||||||
synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
|
|
||||||
|
|
||||||
Note that rcu_dereference() and rcu_assign_pointer() relate to
|
Note that rcu_assign_pointer() relates to SRCU just as it does to
|
||||||
SRCU just as they do to other forms of RCU.
|
other forms of RCU, but instead of rcu_dereference() you should
|
||||||
|
use srcu_dereference() in order to avoid lockdep splats.
|
||||||
|
|
||||||
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
14. The whole point of call_rcu(), synchronize_rcu(), and friends
|
||||||
is to wait until all pre-existing readers have finished before
|
is to wait until all pre-existing readers have finished before
|
||||||
@@ -405,6 +404,9 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
read-side critical sections. It is the responsibility of the
|
read-side critical sections. It is the responsibility of the
|
||||||
RCU update-side primitives to deal with this.
|
RCU update-side primitives to deal with this.
|
||||||
|
|
||||||
|
For SRCU readers, you can use smp_mb__after_srcu_read_unlock()
|
||||||
|
immediately after an srcu_read_unlock() to get a full barrier.
|
||||||
|
|
||||||
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
16. Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
|
||||||
__rcu sparse checks to validate your RCU code. These can help
|
__rcu sparse checks to validate your RCU code. These can help
|
||||||
find problems as follows:
|
find problems as follows:
|
||||||
@@ -428,22 +430,19 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
These debugging aids can help you find problems that are
|
These debugging aids can help you find problems that are
|
||||||
otherwise extremely difficult to spot.
|
otherwise extremely difficult to spot.
|
||||||
|
|
||||||
17. If you register a callback using call_rcu(), call_rcu_bh(),
|
17. If you register a callback using call_rcu() or call_srcu(), and
|
||||||
call_rcu_sched(), or call_srcu(), and pass in a function defined
|
pass in a function defined within a loadable module, then it is
|
||||||
within a loadable module, then it in necessary to wait for
|
necessary to wait for all pending callbacks to be invoked after
|
||||||
all pending callbacks to be invoked after the last invocation
|
the last invocation and before unloading that module. Note that
|
||||||
and before unloading that module. Note that it is absolutely
|
it is absolutely -not- sufficient to wait for a grace period!
|
||||||
-not- sufficient to wait for a grace period! The current (say)
|
The current (say) synchronize_rcu() implementation is -not-
|
||||||
synchronize_rcu() implementation waits only for all previous
|
guaranteed to wait for callbacks registered on other CPUs.
|
||||||
callbacks registered on the CPU that synchronize_rcu() is running
|
Or even on the current CPU if that CPU recently went offline
|
||||||
on, but it is -not- guaranteed to wait for callbacks registered
|
and came back online.
|
||||||
on other CPUs.
|
|
||||||
|
|
||||||
You instead need to use one of the barrier functions:
|
You instead need to use one of the barrier functions:
|
||||||
|
|
||||||
o call_rcu() -> rcu_barrier()
|
o call_rcu() -> rcu_barrier()
|
||||||
o call_rcu_bh() -> rcu_barrier()
|
|
||||||
o call_rcu_sched() -> rcu_barrier()
|
|
||||||
o call_srcu() -> srcu_barrier()
|
o call_srcu() -> srcu_barrier()
|
||||||
|
|
||||||
However, these barrier functions are absolutely -not- guaranteed
|
However, these barrier functions are absolutely -not- guaranteed
|
||||||
|
|||||||
@@ -52,10 +52,10 @@ o If I am running on a uniprocessor kernel, which can only do one
|
|||||||
o How can I see where RCU is currently used in the Linux kernel?
|
o How can I see where RCU is currently used in the Linux kernel?
|
||||||
|
|
||||||
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
|
Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
|
||||||
"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
|
"rcu_read_lock_bh", "rcu_read_unlock_bh", "srcu_read_lock",
|
||||||
"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
|
"srcu_read_unlock", "synchronize_rcu", "synchronize_net",
|
||||||
"synchronize_net", "synchronize_srcu", and the other RCU
|
"synchronize_srcu", and the other RCU primitives. Or grab one
|
||||||
primitives. Or grab one of the cscope databases from:
|
of the cscope databases from:
|
||||||
|
|
||||||
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
|
http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
|
||||||
|
|
||||||
|
|||||||
@@ -351,3 +351,106 @@ garbage values.
|
|||||||
|
|
||||||
In short, rcu_dereference() is -not- optional when you are going to
|
In short, rcu_dereference() is -not- optional when you are going to
|
||||||
dereference the resulting pointer.
|
dereference the resulting pointer.
|
||||||
|
|
||||||
|
|
||||||
|
WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE?
|
||||||
|
|
||||||
|
First, please avoid using rcu_dereference_raw() and also please avoid
|
||||||
|
using rcu_dereference_check() and rcu_dereference_protected() with a
|
||||||
|
second argument with a constant value of 1 (or true, for that matter).
|
||||||
|
With that caution out of the way, here is some guidance for which
|
||||||
|
member of the rcu_dereference() family to use in various situations:
|
||||||
|
|
||||||
|
1. If the access needs to be within an RCU read-side critical
|
||||||
|
section, use rcu_dereference(). With the new consolidated
|
||||||
|
RCU flavors, an RCU read-side critical section is entered
|
||||||
|
using rcu_read_lock(), anything that disables bottom halves,
|
||||||
|
anything that disables interrupts, or anything that disables
|
||||||
|
preemption.
|
||||||
|
|
||||||
|
2. If the access might be within an RCU read-side critical section
|
||||||
|
on the one hand, or protected by (say) my_lock on the other,
|
||||||
|
use rcu_dereference_check(), for example:
|
||||||
|
|
||||||
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock));
|
||||||
|
|
||||||
|
|
||||||
|
3. If the access might be within an RCU read-side critical section
|
||||||
|
on the one hand, or protected by either my_lock or your_lock on
|
||||||
|
the other, again use rcu_dereference_check(), for example:
|
||||||
|
|
||||||
|
p1 = rcu_dereference_check(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock) ||
|
||||||
|
lockdep_is_held(&your_lock));
|
||||||
|
|
||||||
|
4. If the access is on the update side, so that it is always protected
|
||||||
|
by my_lock, use rcu_dereference_protected():
|
||||||
|
|
||||||
|
p1 = rcu_dereference_protected(p->rcu_protected_pointer,
|
||||||
|
lockdep_is_held(&my_lock));
|
||||||
|
|
||||||
|
This can be extended to handle multiple locks as in #3 above,
|
||||||
|
and both can be extended to check other conditions as well.
|
||||||
|
|
||||||
|
5. If the protection is supplied by the caller, and is thus unknown
|
||||||
|
to this code, that is the rare case when rcu_dereference_raw()
|
||||||
|
is appropriate. In addition, rcu_dereference_raw() might be
|
||||||
|
appropriate when the lockdep expression would be excessively
|
||||||
|
complex, except that a better approach in that case might be to
|
||||||
|
take a long hard look at your synchronization design. Still,
|
||||||
|
there are data-locking cases where any one of a very large number
|
||||||
|
of locks or reference counters suffices to protect the pointer,
|
||||||
|
so rcu_dereference_raw() does have its place.
|
||||||
|
|
||||||
|
However, its place is probably quite a bit smaller than one
|
||||||
|
might expect given the number of uses in the current kernel.
|
||||||
|
Ditto for its synonym, rcu_dereference_check( ... , 1), and
|
||||||
|
its close relative, rcu_dereference_protected(... , 1).
|
||||||
|
|
||||||
|
|
||||||
|
SPARSE CHECKING OF RCU-PROTECTED POINTERS
|
||||||
|
|
||||||
|
The sparse static-analysis tool checks for direct access to RCU-protected
|
||||||
|
pointers, which can result in "interesting" bugs due to compiler
|
||||||
|
optimizations involving invented loads and perhaps also load tearing.
|
||||||
|
For example, suppose someone mistakenly does something like this:
|
||||||
|
|
||||||
|
p = q->rcu_protected_pointer;
|
||||||
|
do_something_with(p->a);
|
||||||
|
do_something_else_with(p->b);
|
||||||
|
|
||||||
|
If register pressure is high, the compiler might optimize "p" out
|
||||||
|
of existence, transforming the code to something like this:
|
||||||
|
|
||||||
|
do_something_with(q->rcu_protected_pointer->a);
|
||||||
|
do_something_else_with(q->rcu_protected_pointer->b);
|
||||||
|
|
||||||
|
This could fatally disappoint your code if q->rcu_protected_pointer
|
||||||
|
changed in the meantime. Nor is this a theoretical problem: Exactly
|
||||||
|
this sort of bug cost Paul E. McKenney (and several of his innocent
|
||||||
|
colleagues) a three-day weekend back in the early 1990s.
|
||||||
|
|
||||||
|
Load tearing could of course result in dereferencing a mashup of a pair
|
||||||
|
of pointers, which also might fatally disappoint your code.
|
||||||
|
|
||||||
|
These problems could have been avoided simply by making the code instead
|
||||||
|
read as follows:
|
||||||
|
|
||||||
|
p = rcu_dereference(q->rcu_protected_pointer);
|
||||||
|
do_something_with(p->a);
|
||||||
|
do_something_else_with(p->b);
|
||||||
|
|
||||||
|
Unfortunately, these sorts of bugs can be extremely hard to spot during
|
||||||
|
review. This is where the sparse tool comes into play, along with the
|
||||||
|
"__rcu" marker. You can mark a pointer declaration, whether in a structure
|
||||||
|
or as a formal parameter, with "__rcu", which tells sparse to complain if
|
||||||
|
this pointer is accessed directly. It will also cause sparse to complain
|
||||||
|
if a pointer not marked with "__rcu" is accessed using rcu_dereference()
|
||||||
|
and friends. For example, ->rcu_protected_pointer might be declared as
|
||||||
|
follows:
|
||||||
|
|
||||||
|
struct foo __rcu *rcu_protected_pointer;
|
||||||
|
|
||||||
|
Use of "__rcu" is opt-in. If you choose not to use it, then you should
|
||||||
|
ignore the sparse warnings.
|
||||||
|
|||||||
@@ -83,16 +83,15 @@ Pseudo-code using rcu_barrier() is as follows:
|
|||||||
2. Execute rcu_barrier().
|
2. Execute rcu_barrier().
|
||||||
3. Allow the module to be unloaded.
|
3. Allow the module to be unloaded.
|
||||||
|
|
||||||
There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
|
There is also an srcu_barrier() function for SRCU, and you of course
|
||||||
functions for the other flavors of RCU, and you of course must match
|
must match the flavor of rcu_barrier() with that of call_rcu(). If your
|
||||||
the flavor of rcu_barrier() with that of call_rcu(). If your module
|
module uses multiple flavors of call_rcu(), then it must also use multiple
|
||||||
uses multiple flavors of call_rcu(), then it must also use multiple
|
|
||||||
flavors of rcu_barrier() when unloading that module. For example, if
|
flavors of rcu_barrier() when unloading that module. For example, if
|
||||||
it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
|
it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on
|
||||||
srcu_struct_2(), then the following three lines of code will be required
|
srcu_struct_2, then the following three lines of code will be required
|
||||||
when unloading:
|
when unloading:
|
||||||
|
|
||||||
1 rcu_barrier_bh();
|
1 rcu_barrier();
|
||||||
2 srcu_barrier(&srcu_struct_1);
|
2 srcu_barrier(&srcu_struct_1);
|
||||||
3 srcu_barrier(&srcu_struct_2);
|
3 srcu_barrier(&srcu_struct_2);
|
||||||
|
|
||||||
@@ -185,12 +184,12 @@ module invokes call_rcu() from timers, you will need to first cancel all
|
|||||||
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
the timers, and only then invoke rcu_barrier() to wait for any remaining
|
||||||
RCU callbacks to complete.
|
RCU callbacks to complete.
|
||||||
|
|
||||||
Of course, if you module uses call_rcu_bh(), you will need to invoke
|
Of course, if your module uses call_rcu(), you will need to invoke
|
||||||
rcu_barrier_bh() before unloading. Similarly, if your module uses
|
rcu_barrier() before unloading. Similarly, if your module uses
|
||||||
call_rcu_sched(), you will need to invoke rcu_barrier_sched() before
|
call_srcu(), you will need to invoke srcu_barrier() before unloading,
|
||||||
unloading. If your module uses call_rcu(), call_rcu_bh(), -and-
|
and on the same srcu_struct structure. If your module uses call_rcu()
|
||||||
call_rcu_sched(), then you will need to invoke each of rcu_barrier(),
|
-and- call_srcu(), then you will need to invoke rcu_barrier() -and-
|
||||||
rcu_barrier_bh(), and rcu_barrier_sched().
|
srcu_barrier().
|
||||||
|
|
||||||
|
|
||||||
Implementing rcu_barrier()
|
Implementing rcu_barrier()
|
||||||
@@ -223,8 +222,8 @@ shown below. Note that the final "1" in on_each_cpu()'s argument list
|
|||||||
ensures that all the calls to rcu_barrier_func() will have completed
|
ensures that all the calls to rcu_barrier_func() will have completed
|
||||||
before on_each_cpu() returns. Line 9 then waits for the completion.
|
before on_each_cpu() returns. Line 9 then waits for the completion.
|
||||||
|
|
||||||
This code was rewritten in 2008 to support rcu_barrier_bh() and
|
This code was rewritten in 2008 and several times thereafter, but this
|
||||||
rcu_barrier_sched() in addition to the original rcu_barrier().
|
still gives the general idea.
|
||||||
|
|
||||||
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
The rcu_barrier_func() runs on each CPU, where it invokes call_rcu()
|
||||||
to post an RCU callback, as follows:
|
to post an RCU callback, as follows:
|
||||||
|
|||||||
@@ -310,7 +310,7 @@ reader, updater, and reclaimer.
|
|||||||
|
|
||||||
|
|
||||||
rcu_assign_pointer()
|
rcu_assign_pointer()
|
||||||
+--------+
|
+--------+
|
||||||
+---------------------->| reader |---------+
|
+---------------------->| reader |---------+
|
||||||
| +--------+ |
|
| +--------+ |
|
||||||
| | |
|
| | |
|
||||||
@@ -318,12 +318,12 @@ reader, updater, and reclaimer.
|
|||||||
| | | rcu_read_lock()
|
| | | rcu_read_lock()
|
||||||
| | | rcu_read_unlock()
|
| | | rcu_read_unlock()
|
||||||
| rcu_dereference() | |
|
| rcu_dereference() | |
|
||||||
+---------+ | |
|
+---------+ | |
|
||||||
| updater |<---------------------+ |
|
| updater |<----------------+ |
|
||||||
+---------+ V
|
+---------+ V
|
||||||
| +-----------+
|
| +-----------+
|
||||||
+----------------------------------->| reclaimer |
|
+----------------------------------->| reclaimer |
|
||||||
+-----------+
|
+-----------+
|
||||||
Defer:
|
Defer:
|
||||||
synchronize_rcu() & call_rcu()
|
synchronize_rcu() & call_rcu()
|
||||||
|
|
||||||
|
|||||||
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
|
|||||||
spikes which wouldn't necessarily make a dent in the time averages,
|
spikes which wouldn't necessarily make a dent in the time averages,
|
||||||
or to average trends over custom time frames.
|
or to average trends over custom time frames.
|
||||||
|
|
||||||
|
Monitoring for pressure thresholds
|
||||||
|
==================================
|
||||||
|
|
||||||
|
Users can register triggers and use poll() to be woken up when resource
|
||||||
|
pressure exceeds certain thresholds.
|
||||||
|
|
||||||
|
A trigger describes the maximum cumulative stall time over a specific
|
||||||
|
time window, e.g. 100ms of total stall time within any 500ms window to
|
||||||
|
generate a wakeup event.
|
||||||
|
|
||||||
|
To register a trigger, the user has to open the psi interface file under
|
||||||
|
/proc/pressure/ representing the resource to be monitored and write the
|
||||||
|
desired threshold and time window. The open file descriptor should be
|
||||||
|
used to wait for trigger events using select(), poll() or epoll().
|
||||||
|
The following format is used:
|
||||||
|
|
||||||
|
<some|full> <stall amount in us> <time window in us>
|
||||||
|
|
||||||
|
For example writing "some 150000 1000000" into /proc/pressure/memory
|
||||||
|
would add 150ms threshold for partial memory stall measured within
|
||||||
|
1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
|
||||||
|
would add 50ms threshold for full io stall measured within 1sec time window.
|
||||||
|
|
||||||
|
Triggers can be set on more than one psi metric and more than one trigger
|
||||||
|
for the same psi metric can be specified. However for each trigger a separate
|
||||||
|
file descriptor is required to be able to poll it separately from others,
|
||||||
|
therefore for each trigger a separate open() syscall should be made even
|
||||||
|
when opening the same psi interface file.
|
||||||
|
|
||||||
|
Monitors activate only when system enters stall state for the monitored
|
||||||
|
psi metric and deactivate upon exit from the stall state. While system is
|
||||||
|
in the stall state psi signal growth is monitored at a rate of 10 times per
|
||||||
|
tracking window.
|
||||||
|
|
||||||
|
The kernel accepts window sizes ranging from 500ms to 10s, therefore min
|
||||||
|
monitoring update interval is 50ms and max is 1s. Min limit is set to
|
||||||
|
prevent overly frequent polling. Max limit is chosen as a high enough number
|
||||||
|
after which monitors are most likely not needed and psi averages can be used
|
||||||
|
instead.
|
||||||
|
|
||||||
|
When activated, psi monitor stays active for at least the duration of one
|
||||||
|
tracking window to avoid repeated activations/deactivations when system is
|
||||||
|
bouncing in and out of the stall state.
|
||||||
|
|
||||||
|
Notifications to the userspace are rate-limited to one per tracking window.
|
||||||
|
|
||||||
|
The trigger will de-register when the file descriptor used to define the
|
||||||
|
trigger is closed.
|
||||||
|
|
||||||
|
Userspace monitor usage example
|
||||||
|
===============================
|
||||||
|
|
||||||
|
#include <errno.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <poll.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Monitor memory partial stall with 1s tracking window size
|
||||||
|
* and 150ms threshold.
|
||||||
|
*/
|
||||||
|
int main() {
|
||||||
|
const char trig[] = "some 150000 1000000";
|
||||||
|
struct pollfd fds;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
|
||||||
|
if (fds.fd < 0) {
|
||||||
|
printf("/proc/pressure/memory open error: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fds.events = POLLPRI;
|
||||||
|
|
||||||
|
if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
|
||||||
|
printf("/proc/pressure/memory write error: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("waiting for events...\n");
|
||||||
|
while (1) {
|
||||||
|
n = poll(&fds, 1, -1);
|
||||||
|
if (n < 0) {
|
||||||
|
printf("poll error: %s\n", strerror(errno));
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (fds.revents & POLLERR) {
|
||||||
|
printf("got POLLERR, event source is gone\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (fds.revents & POLLPRI) {
|
||||||
|
printf("event triggered!\n");
|
||||||
|
} else {
|
||||||
|
printf("unknown event received: 0x%x\n", fds.revents);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
Cgroup2 interface
|
Cgroup2 interface
|
||||||
=================
|
=================
|
||||||
|
|
||||||
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
|
|||||||
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
into cgroups. Each subdirectory in the cgroupfs mountpoint contains
|
||||||
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
cpu.pressure, memory.pressure, and io.pressure files; the format is
|
||||||
the same as the /proc/pressure/ files.
|
the same as the /proc/pressure/ files.
|
||||||
|
|
||||||
|
Per-cgroup psi monitors can be specified and used the same way as
|
||||||
|
system-wide ones.
|
||||||
|
|||||||
@@ -1,97 +0,0 @@
|
|||||||
_DSD Device Properties Usage Rules
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
Properties, Property Sets and Property Subsets
|
|
||||||
----------------------------------------------
|
|
||||||
|
|
||||||
The _DSD (Device Specific Data) configuration object, introduced in ACPI 5.1,
|
|
||||||
allows any type of device configuration data to be provided via the ACPI
|
|
||||||
namespace. In principle, the format of the data may be arbitrary, but it has to
|
|
||||||
be identified by a UUID which must be recognized by the driver processing the
|
|
||||||
_DSD output. However, there are generic UUIDs defined for _DSD recognized by
|
|
||||||
the ACPI subsystem in the Linux kernel which automatically processes the data
|
|
||||||
packages associated with them and makes those data available to device drivers
|
|
||||||
as "device properties".
|
|
||||||
|
|
||||||
A device property is a data item consisting of a string key and a value (of a
|
|
||||||
specific type) associated with it.
|
|
||||||
|
|
||||||
In the ACPI _DSD context it is an element of the sub-package following the
|
|
||||||
generic Device Properties UUID in the _DSD return package as specified in the
|
|
||||||
Device Properties UUID definition document [1].
|
|
||||||
|
|
||||||
It also may be regarded as the definition of a key and the associated data type
|
|
||||||
that can be returned by _DSD in the Device Properties UUID sub-package for a
|
|
||||||
given device.
|
|
||||||
|
|
||||||
A property set is a collection of properties applicable to a hardware entity
|
|
||||||
like a device. In the ACPI _DSD context it is the set of all properties that
|
|
||||||
can be returned in the Device Properties UUID sub-package for the device in
|
|
||||||
question.
|
|
||||||
|
|
||||||
Property subsets are nested collections of properties. Each of them is
|
|
||||||
associated with an additional key (name) allowing the subset to be referred
|
|
||||||
to as a whole (and to be treated as a separate entity). The canonical
|
|
||||||
representation of property subsets is via the mechanism specified in the
|
|
||||||
Hierarchical Properties Extension UUID definition document [2].
|
|
||||||
|
|
||||||
Property sets may be hierarchical. That is, a property set may contain
|
|
||||||
multiple property subsets that each may contain property subsets of its
|
|
||||||
own and so on.
|
|
||||||
|
|
||||||
General Validity Rule for Property Sets
|
|
||||||
---------------------------------------
|
|
||||||
|
|
||||||
Valid property sets must follow the guidance given by the Device Properties UUID
|
|
||||||
definition document [1].
|
|
||||||
|
|
||||||
_DSD properties are intended to be used in addition to, and not instead of, the
|
|
||||||
existing mechanisms defined by the ACPI specification. Therefore, as a rule,
|
|
||||||
they should only be used if the ACPI specification does not make direct
|
|
||||||
provisions for handling the underlying use case. It generally is invalid to
|
|
||||||
return property sets which do not follow that rule from _DSD in data packages
|
|
||||||
associated with the Device Properties UUID.
|
|
||||||
|
|
||||||
Additional Considerations
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
There are cases in which, even if the general rule given above is followed in
|
|
||||||
principle, the property set may still not be regarded as a valid one.
|
|
||||||
|
|
||||||
For example, that applies to device properties which may cause kernel code
|
|
||||||
(either a device driver or a library/subsystem) to access hardware in a way
|
|
||||||
possibly leading to a conflict with AML methods in the ACPI namespace. In
|
|
||||||
particular, that may happen if the kernel code uses device properties to
|
|
||||||
manipulate hardware normally controlled by ACPI methods related to power
|
|
||||||
management, like _PSx and _DSW (for device objects) or _ON and _OFF (for power
|
|
||||||
resource objects), or by ACPI device disabling/enabling methods, like _DIS and
|
|
||||||
_SRS.
|
|
||||||
|
|
||||||
In all cases in which kernel code may do something that will confuse AML as a
|
|
||||||
result of using device properties, the device properties in question are not
|
|
||||||
suitable for the ACPI environment and consequently they cannot belong to a valid
|
|
||||||
property set.
|
|
||||||
|
|
||||||
Property Sets and Device Tree Bindings
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
It often is useful to make _DSD return property sets that follow Device Tree
|
|
||||||
bindings.
|
|
||||||
|
|
||||||
In those cases, however, the above validity considerations must be taken into
|
|
||||||
account in the first place and returning invalid property sets from _DSD must be
|
|
||||||
avoided. For this reason, it may not be possible to make _DSD return a property
|
|
||||||
set following the given DT binding literally and completely. Still, for the
|
|
||||||
sake of code re-use, it may make sense to provide as much of the configuration
|
|
||||||
data as possible in the form of device properties and complement that with an
|
|
||||||
ACPI-specific mechanism suitable for the use case at hand.
|
|
||||||
|
|
||||||
In any case, property sets following DT bindings literally should not be
|
|
||||||
expected to automatically work in the ACPI environment regardless of their
|
|
||||||
contents.
|
|
||||||
|
|
||||||
References
|
|
||||||
----------
|
|
||||||
|
|
||||||
[1] http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf
|
|
||||||
[2] http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
Special Usage Model of the ACPI Control Method Lid Device
|
|
||||||
|
|
||||||
Copyright (C) 2016, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
|
|
||||||
Platforms containing lids convey lid state (open/close) to OSPMs using a
|
|
||||||
control method lid device. To implement this, the AML tables issue
|
|
||||||
Notify(lid_device, 0x80) to notify the OSPMs whenever the lid state has
|
|
||||||
changed. The _LID control method for the lid device must be implemented to
|
|
||||||
report the "current" state of the lid as either "opened" or "closed".
|
|
||||||
|
|
||||||
For most platforms, both the _LID method and the lid notifications are
|
|
||||||
reliable. However, there are exceptions. In order to work with these
|
|
||||||
exceptional buggy platforms, special restrictions and expectations should be
|
|
||||||
taken into account. This document describes the restrictions and the
|
|
||||||
expectations of the Linux ACPI lid device driver.
|
|
||||||
|
|
||||||
|
|
||||||
1. Restrictions of the returning value of the _LID control method
|
|
||||||
|
|
||||||
The _LID control method is described to return the "current" lid state.
|
|
||||||
However, the word "current" is ambiguous: some buggy AML tables return
|
|
||||||
the lid state upon the last lid notification instead of returning the lid
|
|
||||||
state upon the last _LID evaluation. There won't be difference when the
|
|
||||||
_LID control method is evaluated during the runtime, the problem is its
|
|
||||||
initial returning value. When the AML tables implement this control method
|
|
||||||
with cached value, the initial returning value is likely not reliable.
|
|
||||||
There are platforms that always return "closed" as the initial lid state.
|
|
||||||
|
|
||||||
2. Restrictions of the lid state change notifications
|
|
||||||
|
|
||||||
There are buggy AML tables never notifying when the lid device state is
|
|
||||||
changed to "opened". Thus the "opened" notification is not guaranteed. But
|
|
||||||
it is guaranteed that the AML tables always notify "closed" when the lid
|
|
||||||
state is changed to "closed". The "closed" notification is normally used to
|
|
||||||
trigger some system power saving operations on Windows. Since it is fully
|
|
||||||
tested, it is reliable from all AML tables.
|
|
||||||
|
|
||||||
3. Expectations for the userspace users of the ACPI lid device driver
|
|
||||||
|
|
||||||
The ACPI button driver exports the lid state to the userspace via the
|
|
||||||
following file:
|
|
||||||
/proc/acpi/button/lid/LID0/state
|
|
||||||
This file actually calls the _LID control method described above. And given
|
|
||||||
the previous explanation, it is not reliable enough on some platforms. So
|
|
||||||
it is advised for the userspace program not to rely solely on this file
|
|
||||||
to determine the actual lid state.
|
|
||||||
|
|
||||||
The ACPI button driver emits the following input event to the userspace:
|
|
||||||
SW_LID
|
|
||||||
The ACPI lid device driver is implemented to try to deliver the platform
|
|
||||||
triggered events to the userspace. However, given the fact that the buggy
|
|
||||||
firmware cannot make sure "opened"/"closed" events are paired, the ACPI
|
|
||||||
button driver uses the following 3 modes in order not to trigger issues.
|
|
||||||
|
|
||||||
If the userspace hasn't been prepared to ignore the unreliable "opened"
|
|
||||||
events and the unreliable initial state notification, Linux users can use
|
|
||||||
the following kernel parameters to handle the possible issues:
|
|
||||||
A. button.lid_init_state=method:
|
|
||||||
When this option is specified, the ACPI button driver reports the
|
|
||||||
initial lid state using the returning value of the _LID control method
|
|
||||||
and whether the "opened"/"closed" events are paired fully relies on the
|
|
||||||
firmware implementation.
|
|
||||||
This option can be used to fix some platforms where the returning value
|
|
||||||
of the _LID control method is reliable but the initial lid state
|
|
||||||
notification is missing.
|
|
||||||
This option is the default behavior during the period the userspace
|
|
||||||
isn't ready to handle the buggy AML tables.
|
|
||||||
B. button.lid_init_state=open:
|
|
||||||
When this option is specified, the ACPI button driver always reports the
|
|
||||||
initial lid state as "opened" and whether the "opened"/"closed" events
|
|
||||||
are paired fully relies on the firmware implementation.
|
|
||||||
This may fix some platforms where the returning value of the _LID
|
|
||||||
control method is not reliable and the initial lid state notification is
|
|
||||||
missing.
|
|
||||||
|
|
||||||
If the userspace has been prepared to ignore the unreliable "opened" events
|
|
||||||
and the unreliable initial state notification, Linux users should always
|
|
||||||
use the following kernel parameter:
|
|
||||||
C. button.lid_init_state=ignore:
|
|
||||||
When this option is specified, the ACPI button driver never reports the
|
|
||||||
initial lid state and there is a compensation mechanism implemented to
|
|
||||||
ensure that the reliable "closed" notifications can always be delivered
|
|
||||||
to the userspace by always pairing "closed" input events with complement
|
|
||||||
"opened" input events. But there is still no guarantee that the "opened"
|
|
||||||
notifications can be delivered to the userspace when the lid is actually
|
|
||||||
opened, given that some AML tables do not send "opened" notifications
|
|
||||||
reliably.
|
|
||||||
In this mode, if everything is correctly implemented by the platform
|
|
||||||
firmware, the old userspace programs should still work. Otherwise, the
|
|
||||||
new userspace programs are required to work with the ACPI button driver.
|
|
||||||
This option will be the default behavior after the userspace is ready to
|
|
||||||
handle the buggy AML tables.
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
The AML Debugger
|
|
||||||
|
|
||||||
Copyright (C) 2016, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
This document describes the usage of the AML debugger embedded in the Linux
|
|
||||||
kernel.
|
|
||||||
|
|
||||||
1. Build the debugger
|
|
||||||
|
|
||||||
The following kernel configuration items are required to enable the AML
|
|
||||||
debugger interface from the Linux kernel:
|
|
||||||
|
|
||||||
CONFIG_ACPI_DEBUGGER=y
|
|
||||||
CONFIG_ACPI_DEBUGGER_USER=m
|
|
||||||
|
|
||||||
The userspace utilities can be built from the kernel source tree using
|
|
||||||
the following commands:
|
|
||||||
|
|
||||||
$ cd tools
|
|
||||||
$ make acpi
|
|
||||||
|
|
||||||
The resultant userspace tool binary is then located at:
|
|
||||||
|
|
||||||
tools/power/acpi/acpidbg
|
|
||||||
|
|
||||||
It can be installed to system directories by running "make install" (as a
|
|
||||||
sufficiently privileged user).
|
|
||||||
|
|
||||||
2. Start the userspace debugger interface
|
|
||||||
|
|
||||||
After booting the kernel with the debugger built-in, the debugger can be
|
|
||||||
started by using the following commands:
|
|
||||||
|
|
||||||
# mount -t debugfs none /sys/kernel/debug
|
|
||||||
# modprobe acpi_dbg
|
|
||||||
# tools/power/acpi/acpidbg
|
|
||||||
|
|
||||||
That spawns the interactive AML debugger environment where you can execute
|
|
||||||
debugger commands.
|
|
||||||
|
|
||||||
The commands are documented in the "ACPICA Overview and Programmer Reference"
|
|
||||||
that can be downloaded from
|
|
||||||
|
|
||||||
https://acpica.org/documentation
|
|
||||||
|
|
||||||
The detailed debugger commands reference is located in Chapter 12 "ACPICA
|
|
||||||
Debugger Reference". The "help" command can be used for a quick reference.
|
|
||||||
|
|
||||||
3. Stop the userspace debugger interface
|
|
||||||
|
|
||||||
The interactive debugger interface can be closed by pressing Ctrl+C or using
|
|
||||||
the "quit" or "exit" commands. When finished, unload the module with:
|
|
||||||
|
|
||||||
# rmmod acpi_dbg
|
|
||||||
|
|
||||||
The module unloading may fail if there is an acpidbg instance running.
|
|
||||||
|
|
||||||
4. Run the debugger in a script
|
|
||||||
|
|
||||||
It may be useful to run the AML debugger in a test script. "acpidbg" supports
|
|
||||||
this in a special "batch" mode. For example, the following command outputs
|
|
||||||
the entire ACPI namespace:
|
|
||||||
|
|
||||||
# acpidbg -b "namespace"
|
|
||||||
@@ -1,177 +0,0 @@
|
|||||||
APEI Error INJection
|
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
EINJ provides a hardware error injection mechanism. It is very useful
|
|
||||||
for debugging and testing APEI and RAS features in general.
|
|
||||||
|
|
||||||
You need to check whether your BIOS supports EINJ first. For that, look
|
|
||||||
for early boot messages similar to this one:
|
|
||||||
|
|
||||||
ACPI: EINJ 0x000000007370A000 000150 (v01 INTEL 00000001 INTL 00000001)
|
|
||||||
|
|
||||||
which shows that the BIOS is exposing an EINJ table - it is the
|
|
||||||
mechanism through which the injection is done.
|
|
||||||
|
|
||||||
Alternatively, look in /sys/firmware/acpi/tables for an "EINJ" file,
|
|
||||||
which is a different representation of the same thing.
|
|
||||||
|
|
||||||
It doesn't necessarily mean that EINJ is not supported if those above
|
|
||||||
don't exist: before you give up, go into BIOS setup to see if the BIOS
|
|
||||||
has an option to enable error injection. Look for something called WHEA
|
|
||||||
or similar. Often, you need to enable an ACPI5 support option prior, in
|
|
||||||
order to see the APEI,EINJ,... functionality supported and exposed by
|
|
||||||
the BIOS menu.
|
|
||||||
|
|
||||||
To use EINJ, make sure the following options are enabled in your kernel
|
|
||||||
configuration:
|
|
||||||
|
|
||||||
CONFIG_DEBUG_FS
|
|
||||||
CONFIG_ACPI_APEI
|
|
||||||
CONFIG_ACPI_APEI_EINJ
|
|
||||||
|
|
||||||
The EINJ user interface is in <debugfs mount point>/apei/einj.
|
|
||||||
|
|
||||||
The following files belong to it:
|
|
||||||
|
|
||||||
- available_error_type
|
|
||||||
|
|
||||||
This file shows which error types are supported:
|
|
||||||
|
|
||||||
Error Type Value Error Description
|
|
||||||
================ =================
|
|
||||||
0x00000001 Processor Correctable
|
|
||||||
0x00000002 Processor Uncorrectable non-fatal
|
|
||||||
0x00000004 Processor Uncorrectable fatal
|
|
||||||
0x00000008 Memory Correctable
|
|
||||||
0x00000010 Memory Uncorrectable non-fatal
|
|
||||||
0x00000020 Memory Uncorrectable fatal
|
|
||||||
0x00000040 PCI Express Correctable
|
|
||||||
0x00000080 PCI Express Uncorrectable fatal
|
|
||||||
0x00000100 PCI Express Uncorrectable non-fatal
|
|
||||||
0x00000200 Platform Correctable
|
|
||||||
0x00000400 Platform Uncorrectable non-fatal
|
|
||||||
0x00000800 Platform Uncorrectable fatal
|
|
||||||
|
|
||||||
The format of the file contents are as above, except present are only
|
|
||||||
the available error types.
|
|
||||||
|
|
||||||
- error_type
|
|
||||||
|
|
||||||
Set the value of the error type being injected. Possible error types
|
|
||||||
are defined in the file available_error_type above.
|
|
||||||
|
|
||||||
- error_inject
|
|
||||||
|
|
||||||
Write any integer to this file to trigger the error injection. Make
|
|
||||||
sure you have specified all necessary error parameters, i.e. this
|
|
||||||
write should be the last step when injecting errors.
|
|
||||||
|
|
||||||
- flags
|
|
||||||
|
|
||||||
Present for kernel versions 3.13 and above. Used to specify which
|
|
||||||
of param{1..4} are valid and should be used by the firmware during
|
|
||||||
injection. Value is a bitmask as specified in ACPI5.0 spec for the
|
|
||||||
SET_ERROR_TYPE_WITH_ADDRESS data structure:
|
|
||||||
|
|
||||||
Bit 0 - Processor APIC field valid (see param3 below).
|
|
||||||
Bit 1 - Memory address and mask valid (param1 and param2).
|
|
||||||
Bit 2 - PCIe (seg,bus,dev,fn) valid (see param4 below).
|
|
||||||
|
|
||||||
If set to zero, legacy behavior is mimicked where the type of
|
|
||||||
injection specifies just one bit set, and param1 is multiplexed.
|
|
||||||
|
|
||||||
- param1
|
|
||||||
|
|
||||||
This file is used to set the first error parameter value. Its effect
|
|
||||||
depends on the error type specified in error_type. For example, if
|
|
||||||
error type is memory related type, the param1 should be a valid
|
|
||||||
physical memory address. [Unless "flag" is set - see above]
|
|
||||||
|
|
||||||
- param2
|
|
||||||
|
|
||||||
Same use as param1 above. For example, if error type is of memory
|
|
||||||
related type, then param2 should be a physical memory address mask.
|
|
||||||
Linux requires page or narrower granularity, say, 0xfffffffffffff000.
|
|
||||||
|
|
||||||
- param3
|
|
||||||
|
|
||||||
Used when the 0x1 bit is set in "flags" to specify the APIC id
|
|
||||||
|
|
||||||
- param4
|
|
||||||
Used when the 0x4 bit is set in "flags" to specify target PCIe device
|
|
||||||
|
|
||||||
- notrigger
|
|
||||||
|
|
||||||
The error injection mechanism is a two-step process. First inject the
|
|
||||||
error, then perform some actions to trigger it. Setting "notrigger"
|
|
||||||
to 1 skips the trigger phase, which *may* allow the user to cause the
|
|
||||||
error in some other context by a simple access to the CPU, memory
|
|
||||||
location, or device that is the target of the error injection. Whether
|
|
||||||
this actually works depends on what operations the BIOS actually
|
|
||||||
includes in the trigger phase.
|
|
||||||
|
|
||||||
BIOS versions based on the ACPI 4.0 specification have limited options
|
|
||||||
in controlling where the errors are injected. Your BIOS may support an
|
|
||||||
extension (enabled with the param_extension=1 module parameter, or boot
|
|
||||||
command line einj.param_extension=1). This allows the address and mask
|
|
||||||
for memory injections to be specified by the param1 and param2 files in
|
|
||||||
apei/einj.
|
|
||||||
|
|
||||||
BIOS versions based on the ACPI 5.0 specification have more control over
|
|
||||||
the target of the injection. For processor-related errors (type 0x1, 0x2
|
|
||||||
and 0x4), you can set flags to 0x3 (param3 for bit 0, and param1 and
|
|
||||||
param2 for bit 1) so that you have more information added to the error
|
|
||||||
signature being injected. The actual data passed is this:
|
|
||||||
|
|
||||||
memory_address = param1;
|
|
||||||
memory_address_range = param2;
|
|
||||||
apicid = param3;
|
|
||||||
pcie_sbdf = param4;
|
|
||||||
|
|
||||||
For memory errors (type 0x8, 0x10 and 0x20) the address is set using
|
|
||||||
param1 with a mask in param2 (0x0 is equivalent to all ones). For PCI
|
|
||||||
express errors (type 0x40, 0x80 and 0x100) the segment, bus, device and
|
|
||||||
function are specified using param1:
|
|
||||||
|
|
||||||
31 24 23 16 15 11 10 8 7 0
|
|
||||||
+-------------------------------------------------+
|
|
||||||
| segment | bus | device | function | reserved |
|
|
||||||
+-------------------------------------------------+
|
|
||||||
|
|
||||||
Anyway, you get the idea, if there's doubt just take a look at the code
|
|
||||||
in drivers/acpi/apei/einj.c.
|
|
||||||
|
|
||||||
An ACPI 5.0 BIOS may also allow vendor-specific errors to be injected.
|
|
||||||
In this case a file named vendor will contain identifying information
|
|
||||||
from the BIOS that hopefully will allow an application wishing to use
|
|
||||||
the vendor-specific extension to tell that they are running on a BIOS
|
|
||||||
that supports it. All vendor extensions have the 0x80000000 bit set in
|
|
||||||
error_type. A file vendor_flags controls the interpretation of param1
|
|
||||||
and param2 (1 = PROCESSOR, 2 = MEMORY, 4 = PCI). See your BIOS vendor
|
|
||||||
documentation for details (and expect changes to this API if vendors
|
|
||||||
creativity in using this feature expands beyond our expectations).
|
|
||||||
|
|
||||||
|
|
||||||
An error injection example:
|
|
||||||
|
|
||||||
# cd /sys/kernel/debug/apei/einj
|
|
||||||
# cat available_error_type # See which errors can be injected
|
|
||||||
0x00000002 Processor Uncorrectable non-fatal
|
|
||||||
0x00000008 Memory Correctable
|
|
||||||
0x00000010 Memory Uncorrectable non-fatal
|
|
||||||
# echo 0x12345000 > param1 # Set memory address for injection
|
|
||||||
# echo $((-1 << 12)) > param2 # Mask 0xfffffffffffff000 - anywhere in this page
|
|
||||||
# echo 0x8 > error_type # Choose correctable memory error
|
|
||||||
# echo 1 > error_inject # Inject now
|
|
||||||
|
|
||||||
You should see something like this in dmesg:
|
|
||||||
|
|
||||||
[22715.830801] EDAC sbridge MC3: HANDLING MCE MEMORY ERROR
|
|
||||||
[22715.834759] EDAC sbridge MC3: CPU 0: Machine Check Event: 0 Bank 7: 8c00004000010090
|
|
||||||
[22715.834759] EDAC sbridge MC3: TSC 0
|
|
||||||
[22715.834759] EDAC sbridge MC3: ADDR 12345000 EDAC sbridge MC3: MISC 144780c86
|
|
||||||
[22715.834759] EDAC sbridge MC3: PROCESSOR 0:306e7 TIME 1422553404 SOCKET 0 APIC 0
|
|
||||||
[22716.616173] EDAC MC3: 1 CE memory read error on CPU_SrcID#0_Channel#0_DIMM#0 (channel:0 slot:0 page:0x12345 offset:0x0 grain:32 syndrome:0x0 - area:DRAM err_code:0001:0090 socket:0 channel_mask:1 rank:0)
|
|
||||||
|
|
||||||
For more information about EINJ, please refer to ACPI specification
|
|
||||||
version 4.0, section 17.5 and ACPI 5.0, section 18.6.
|
|
||||||
@@ -1,147 +0,0 @@
|
|||||||
APEI output format
|
|
||||||
~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
APEI uses printk as hardware error reporting interface, the output
|
|
||||||
format is as follows.
|
|
||||||
|
|
||||||
<error record> :=
|
|
||||||
APEI generic hardware error status
|
|
||||||
severity: <integer>, <severity string>
|
|
||||||
section: <integer>, severity: <integer>, <severity string>
|
|
||||||
flags: <integer>
|
|
||||||
<section flags strings>
|
|
||||||
fru_id: <uuid string>
|
|
||||||
fru_text: <string>
|
|
||||||
section_type: <section type string>
|
|
||||||
<section data>
|
|
||||||
|
|
||||||
<severity string>* := recoverable | fatal | corrected | info
|
|
||||||
|
|
||||||
<section flags strings># :=
|
|
||||||
[primary][, containment warning][, reset][, threshold exceeded]\
|
|
||||||
[, resource not accessible][, latent error]
|
|
||||||
|
|
||||||
<section type string> := generic processor error | memory error | \
|
|
||||||
PCIe error | unknown, <uuid string>
|
|
||||||
|
|
||||||
<section data> :=
|
|
||||||
<generic processor section data> | <memory section data> | \
|
|
||||||
<pcie section data> | <null>
|
|
||||||
|
|
||||||
<generic processor section data> :=
|
|
||||||
[processor_type: <integer>, <proc type string>]
|
|
||||||
[processor_isa: <integer>, <proc isa string>]
|
|
||||||
[error_type: <integer>
|
|
||||||
<proc error type strings>]
|
|
||||||
[operation: <integer>, <proc operation string>]
|
|
||||||
[flags: <integer>
|
|
||||||
<proc flags strings>]
|
|
||||||
[level: <integer>]
|
|
||||||
[version_info: <integer>]
|
|
||||||
[processor_id: <integer>]
|
|
||||||
[target_address: <integer>]
|
|
||||||
[requestor_id: <integer>]
|
|
||||||
[responder_id: <integer>]
|
|
||||||
[IP: <integer>]
|
|
||||||
|
|
||||||
<proc type string>* := IA32/X64 | IA64
|
|
||||||
|
|
||||||
<proc isa string>* := IA32 | IA64 | X64
|
|
||||||
|
|
||||||
<proc error type strings># :=
|
|
||||||
[cache error][, TLB error][, bus error][, micro-architectural error]
|
|
||||||
|
|
||||||
<proc operation string>* := unknown or generic | data read | data write | \
|
|
||||||
instruction execution
|
|
||||||
|
|
||||||
<proc flags strings># :=
|
|
||||||
[restartable][, precise IP][, overflow][, corrected]
|
|
||||||
|
|
||||||
<memory section data> :=
|
|
||||||
[error_status: <integer>]
|
|
||||||
[physical_address: <integer>]
|
|
||||||
[physical_address_mask: <integer>]
|
|
||||||
[node: <integer>]
|
|
||||||
[card: <integer>]
|
|
||||||
[module: <integer>]
|
|
||||||
[bank: <integer>]
|
|
||||||
[device: <integer>]
|
|
||||||
[row: <integer>]
|
|
||||||
[column: <integer>]
|
|
||||||
[bit_position: <integer>]
|
|
||||||
[requestor_id: <integer>]
|
|
||||||
[responder_id: <integer>]
|
|
||||||
[target_id: <integer>]
|
|
||||||
[error_type: <integer>, <mem error type string>]
|
|
||||||
|
|
||||||
<mem error type string>* :=
|
|
||||||
unknown | no error | single-bit ECC | multi-bit ECC | \
|
|
||||||
single-symbol chipkill ECC | multi-symbol chipkill ECC | master abort | \
|
|
||||||
target abort | parity error | watchdog timeout | invalid address | \
|
|
||||||
mirror Broken | memory sparing | scrub corrected error | \
|
|
||||||
scrub uncorrected error
|
|
||||||
|
|
||||||
<pcie section data> :=
|
|
||||||
[port_type: <integer>, <pcie port type string>]
|
|
||||||
[version: <integer>.<integer>]
|
|
||||||
[command: <integer>, status: <integer>]
|
|
||||||
[device_id: <integer>:<integer>:<integer>.<integer>
|
|
||||||
slot: <integer>
|
|
||||||
secondary_bus: <integer>
|
|
||||||
vendor_id: <integer>, device_id: <integer>
|
|
||||||
class_code: <integer>]
|
|
||||||
[serial number: <integer>, <integer>]
|
|
||||||
[bridge: secondary_status: <integer>, control: <integer>]
|
|
||||||
[aer_status: <integer>, aer_mask: <integer>
|
|
||||||
<aer status string>
|
|
||||||
[aer_uncor_severity: <integer>]
|
|
||||||
aer_layer=<aer layer string>, aer_agent=<aer agent string>
|
|
||||||
aer_tlp_header: <integer> <integer> <integer> <integer>]
|
|
||||||
|
|
||||||
<pcie port type string>* := PCIe end point | legacy PCI end point | \
|
|
||||||
unknown | unknown | root port | upstream switch port | \
|
|
||||||
downstream switch port | PCIe to PCI/PCI-X bridge | \
|
|
||||||
PCI/PCI-X to PCIe bridge | root complex integrated endpoint device | \
|
|
||||||
root complex event collector
|
|
||||||
|
|
||||||
if section severity is fatal or recoverable
|
|
||||||
<aer status string># :=
|
|
||||||
unknown | unknown | unknown | unknown | Data Link Protocol | \
|
|
||||||
unknown | unknown | unknown | unknown | unknown | unknown | unknown | \
|
|
||||||
Poisoned TLP | Flow Control Protocol | Completion Timeout | \
|
|
||||||
Completer Abort | Unexpected Completion | Receiver Overflow | \
|
|
||||||
Malformed TLP | ECRC | Unsupported Request
|
|
||||||
else
|
|
||||||
<aer status string># :=
|
|
||||||
Receiver Error | unknown | unknown | unknown | unknown | unknown | \
|
|
||||||
Bad TLP | Bad DLLP | RELAY_NUM Rollover | unknown | unknown | unknown | \
|
|
||||||
Replay Timer Timeout | Advisory Non-Fatal
|
|
||||||
fi
|
|
||||||
|
|
||||||
<aer layer string> :=
|
|
||||||
Physical Layer | Data Link Layer | Transaction Layer
|
|
||||||
|
|
||||||
<aer agent string> :=
|
|
||||||
Receiver ID | Requester ID | Completer ID | Transmitter ID
|
|
||||||
|
|
||||||
Where [] designates that the corresponding content is optional
|
|
||||||
|
|
||||||
All <field string> descriptions with * have the following format:
|
|
||||||
|
|
||||||
field: <integer>, <field string>
|
|
||||||
|
|
||||||
Where value of <integer> should be the position of "string" in <field
|
|
||||||
string> description. Otherwise, <field string> will be "unknown".
|
|
||||||
|
|
||||||
All <field strings> descriptions with # have the following format:
|
|
||||||
|
|
||||||
field: <integer>
|
|
||||||
<field strings>
|
|
||||||
|
|
||||||
Where each string in <field strings> corresponds to one set bit of
|
|
||||||
<integer>. The bit position is the position of "string" in <field
|
|
||||||
strings> description.
|
|
||||||
|
|
||||||
For more detailed explanation of every field, please refer to UEFI
|
|
||||||
specification version 2.3 or later, section Appendix N: Common
|
|
||||||
Platform Error Record.
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
|
|
||||||
Collaborative Processor Performance Control (CPPC)
|
|
||||||
|
|
||||||
CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
|
|
||||||
performance of a logical processor on a contiguous and abstract performance
|
|
||||||
scale. CPPC exposes a set of registers to describe abstract performance scale,
|
|
||||||
to request performance levels and to measure per-cpu delivered performance.
|
|
||||||
|
|
||||||
For more details on CPPC please refer to the ACPI specification at:
|
|
||||||
|
|
||||||
http://uefi.org/specifications
|
|
||||||
|
|
||||||
Some of the CPPC registers are exposed via sysfs under:
|
|
||||||
|
|
||||||
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
|
||||||
|
|
||||||
for each cpu X
|
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
|
||||||
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
|
||||||
total 0
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
|
||||||
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
* highest_perf : Highest performance of this processor (abstract scale).
|
|
||||||
* nominal_perf : Highest sustained performance of this processor (abstract scale).
|
|
||||||
* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
|
|
||||||
power savings (abstract scale).
|
|
||||||
* lowest_perf : Lowest performance of this processor (abstract scale).
|
|
||||||
|
|
||||||
* lowest_freq : CPU frequency corresponding to lowest_perf (in MHz).
|
|
||||||
* nominal_freq : CPU frequency corresponding to nominal_perf (in MHz).
|
|
||||||
The above frequencies should only be used to report processor performance in
|
|
||||||
frequency instead of abstract scale. These values should not be used for any
|
|
||||||
functional decisions.
|
|
||||||
|
|
||||||
* feedback_ctrs : Includes both Reference and delivered performance counter.
|
|
||||||
Reference counter ticks up proportional to processor's reference performance.
|
|
||||||
Delivered counter ticks up proportional to processor's delivered performance.
|
|
||||||
* wraparound_time: Minimum time for the feedback counters to wraparound (seconds).
|
|
||||||
* reference_perf : Performance level at which reference performance counter
|
|
||||||
accumulates (abstract scale).
|
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
Computing Average Delivered Performance
|
|
||||||
|
|
||||||
Below describes the steps to compute the average performance delivered by taking
|
|
||||||
two different snapshots of feedback counters at time T1 and T2.
|
|
||||||
|
|
||||||
T1: Read feedback_ctrs as fbc_t1
|
|
||||||
Wait or run some workload
|
|
||||||
T2: Read feedback_ctrs as fbc_t2
|
|
||||||
|
|
||||||
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
|
||||||
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
|
||||||
|
|
||||||
delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
|
|
||||||
@@ -1,148 +0,0 @@
|
|||||||
ACPI Debug Output
|
|
||||||
|
|
||||||
|
|
||||||
The ACPI CA, the Linux ACPI core, and some ACPI drivers can generate debug
|
|
||||||
output. This document describes how to use this facility.
|
|
||||||
|
|
||||||
Compile-time configuration
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
ACPI debug output is globally enabled by CONFIG_ACPI_DEBUG. If this config
|
|
||||||
option is turned off, the debug messages are not even built into the
|
|
||||||
kernel.
|
|
||||||
|
|
||||||
Boot- and run-time configuration
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
When CONFIG_ACPI_DEBUG=y, you can select the component and level of messages
|
|
||||||
you're interested in. At boot-time, use the acpi.debug_layer and
|
|
||||||
acpi.debug_level kernel command line options. After boot, you can use the
|
|
||||||
debug_layer and debug_level files in /sys/module/acpi/parameters/ to control
|
|
||||||
the debug messages.
|
|
||||||
|
|
||||||
debug_layer (component)
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
The "debug_layer" is a mask that selects components of interest, e.g., a
|
|
||||||
specific driver or part of the ACPI interpreter. To build the debug_layer
|
|
||||||
bitmask, look for the "#define _COMPONENT" in an ACPI source file.
|
|
||||||
|
|
||||||
You can set the debug_layer mask at boot-time using the acpi.debug_layer
|
|
||||||
command line argument, and you can change it after boot by writing values
|
|
||||||
to /sys/module/acpi/parameters/debug_layer.
|
|
||||||
|
|
||||||
The possible components are defined in include/acpi/acoutput.h and
|
|
||||||
include/acpi/acpi_drivers.h. Reading /sys/module/acpi/parameters/debug_layer
|
|
||||||
shows the supported mask values, currently these:
|
|
||||||
|
|
||||||
ACPI_UTILITIES 0x00000001
|
|
||||||
ACPI_HARDWARE 0x00000002
|
|
||||||
ACPI_EVENTS 0x00000004
|
|
||||||
ACPI_TABLES 0x00000008
|
|
||||||
ACPI_NAMESPACE 0x00000010
|
|
||||||
ACPI_PARSER 0x00000020
|
|
||||||
ACPI_DISPATCHER 0x00000040
|
|
||||||
ACPI_EXECUTER 0x00000080
|
|
||||||
ACPI_RESOURCES 0x00000100
|
|
||||||
ACPI_CA_DEBUGGER 0x00000200
|
|
||||||
ACPI_OS_SERVICES 0x00000400
|
|
||||||
ACPI_CA_DISASSEMBLER 0x00000800
|
|
||||||
ACPI_COMPILER 0x00001000
|
|
||||||
ACPI_TOOLS 0x00002000
|
|
||||||
ACPI_BUS_COMPONENT 0x00010000
|
|
||||||
ACPI_AC_COMPONENT 0x00020000
|
|
||||||
ACPI_BATTERY_COMPONENT 0x00040000
|
|
||||||
ACPI_BUTTON_COMPONENT 0x00080000
|
|
||||||
ACPI_SBS_COMPONENT 0x00100000
|
|
||||||
ACPI_FAN_COMPONENT 0x00200000
|
|
||||||
ACPI_PCI_COMPONENT 0x00400000
|
|
||||||
ACPI_POWER_COMPONENT 0x00800000
|
|
||||||
ACPI_CONTAINER_COMPONENT 0x01000000
|
|
||||||
ACPI_SYSTEM_COMPONENT 0x02000000
|
|
||||||
ACPI_THERMAL_COMPONENT 0x04000000
|
|
||||||
ACPI_MEMORY_DEVICE_COMPONENT 0x08000000
|
|
||||||
ACPI_VIDEO_COMPONENT 0x10000000
|
|
||||||
ACPI_PROCESSOR_COMPONENT 0x20000000
|
|
||||||
|
|
||||||
debug_level
|
|
||||||
-----------
|
|
||||||
|
|
||||||
The "debug_level" is a mask that selects different types of messages, e.g.,
|
|
||||||
those related to initialization, method execution, informational messages, etc.
|
|
||||||
To build debug_level, look at the level specified in an ACPI_DEBUG_PRINT()
|
|
||||||
statement.
|
|
||||||
|
|
||||||
The ACPI interpreter uses several different levels, but the Linux
|
|
||||||
ACPI core and ACPI drivers generally only use ACPI_LV_INFO.
|
|
||||||
|
|
||||||
You can set the debug_level mask at boot-time using the acpi.debug_level
|
|
||||||
command line argument, and you can change it after boot by writing values
|
|
||||||
to /sys/module/acpi/parameters/debug_level.
|
|
||||||
|
|
||||||
The possible levels are defined in include/acpi/acoutput.h. Reading
|
|
||||||
/sys/module/acpi/parameters/debug_level shows the supported mask values,
|
|
||||||
currently these:
|
|
||||||
|
|
||||||
ACPI_LV_INIT 0x00000001
|
|
||||||
ACPI_LV_DEBUG_OBJECT 0x00000002
|
|
||||||
ACPI_LV_INFO 0x00000004
|
|
||||||
ACPI_LV_INIT_NAMES 0x00000020
|
|
||||||
ACPI_LV_PARSE 0x00000040
|
|
||||||
ACPI_LV_LOAD 0x00000080
|
|
||||||
ACPI_LV_DISPATCH 0x00000100
|
|
||||||
ACPI_LV_EXEC 0x00000200
|
|
||||||
ACPI_LV_NAMES 0x00000400
|
|
||||||
ACPI_LV_OPREGION 0x00000800
|
|
||||||
ACPI_LV_BFIELD 0x00001000
|
|
||||||
ACPI_LV_TABLES 0x00002000
|
|
||||||
ACPI_LV_VALUES 0x00004000
|
|
||||||
ACPI_LV_OBJECTS 0x00008000
|
|
||||||
ACPI_LV_RESOURCES 0x00010000
|
|
||||||
ACPI_LV_USER_REQUESTS 0x00020000
|
|
||||||
ACPI_LV_PACKAGE 0x00040000
|
|
||||||
ACPI_LV_ALLOCATIONS 0x00100000
|
|
||||||
ACPI_LV_FUNCTIONS 0x00200000
|
|
||||||
ACPI_LV_OPTIMIZATIONS 0x00400000
|
|
||||||
ACPI_LV_MUTEX 0x01000000
|
|
||||||
ACPI_LV_THREADS 0x02000000
|
|
||||||
ACPI_LV_IO 0x04000000
|
|
||||||
ACPI_LV_INTERRUPTS 0x08000000
|
|
||||||
ACPI_LV_AML_DISASSEMBLE 0x10000000
|
|
||||||
ACPI_LV_VERBOSE_INFO 0x20000000
|
|
||||||
ACPI_LV_FULL_TABLES 0x40000000
|
|
||||||
ACPI_LV_EVENTS 0x80000000
|
|
||||||
|
|
||||||
Examples
|
|
||||||
--------
|
|
||||||
|
|
||||||
For example, drivers/acpi/bus.c contains this:
|
|
||||||
|
|
||||||
#define _COMPONENT ACPI_BUS_COMPONENT
|
|
||||||
...
|
|
||||||
ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device insertion detected\n"));
|
|
||||||
|
|
||||||
To turn on this message, set the ACPI_BUS_COMPONENT bit in acpi.debug_layer
|
|
||||||
and the ACPI_LV_INFO bit in acpi.debug_level. (The ACPI_DEBUG_PRINT
|
|
||||||
statement uses ACPI_DB_INFO, which is a macro based on the ACPI_LV_INFO
|
|
||||||
definition.)
|
|
||||||
|
|
||||||
Enable all AML "Debug" output (stores to the Debug object while interpreting
|
|
||||||
AML) during boot:
|
|
||||||
|
|
||||||
acpi.debug_layer=0xffffffff acpi.debug_level=0x2
|
|
||||||
|
|
||||||
Enable PCI and PCI interrupt routing debug messages:
|
|
||||||
|
|
||||||
acpi.debug_layer=0x400000 acpi.debug_level=0x4
|
|
||||||
|
|
||||||
Enable all ACPI hardware-related messages:
|
|
||||||
|
|
||||||
acpi.debug_layer=0x2 acpi.debug_level=0xffffffff
|
|
||||||
|
|
||||||
Enable all ACPI_DB_INFO messages after boot:
|
|
||||||
|
|
||||||
# echo 0x4 > /sys/module/acpi/parameters/debug_level
|
|
||||||
|
|
||||||
Show all valid component values:
|
|
||||||
|
|
||||||
# cat /sys/module/acpi/parameters/debug_layer
|
|
||||||
@@ -1,89 +0,0 @@
|
|||||||
Copyright (C) 2018 Intel Corporation
|
|
||||||
Author: Sakari Ailus <sakari.ailus@linux.intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Referencing hierarchical data nodes
|
|
||||||
-----------------------------------
|
|
||||||
|
|
||||||
ACPI in general allows referring to device objects in the tree only.
|
|
||||||
Hierarchical data extension nodes may not be referred to directly, hence this
|
|
||||||
document defines a scheme to implement such references.
|
|
||||||
|
|
||||||
A reference consists of the device object name followed by one or more
|
|
||||||
hierarchical data extension [1] keys. Specifically, the hierarchical data
|
|
||||||
extension node which is referred to by the key shall lie directly under the
|
|
||||||
parent object i.e. either the device object or another hierarchical data
|
|
||||||
extension node.
|
|
||||||
|
|
||||||
The keys in the hierarchical data nodes shall consist of the name of the node,
|
|
||||||
"@" character and the number of the node in hexadecimal notation (without pre-
|
|
||||||
or postfixes). The same ACPI object shall include the _DSD property extension
|
|
||||||
with a property "reg" that shall have the same numerical value as the number of
|
|
||||||
the node.
|
|
||||||
|
|
||||||
In case a hierarchical data extensions node has no numerical value, then the
|
|
||||||
"reg" property shall be omitted from the ACPI object's _DSD properties and the
|
|
||||||
"@" character and the number shall be omitted from the hierarchical data
|
|
||||||
extension key.
|
|
||||||
|
|
||||||
|
|
||||||
Example
|
|
||||||
-------
|
|
||||||
|
|
||||||
In the ASL snippet below, the "reference" _DSD property [2] contains a
|
|
||||||
device object reference to DEV0 and under that device object, a
|
|
||||||
hierarchical data extension key "node@1" referring to the NOD1 object
|
|
||||||
and lastly, a hierarchical data extension key "anothernode" referring to
|
|
||||||
the ANOD object which is also the final target node of the reference.
|
|
||||||
|
|
||||||
Device (DEV0)
|
|
||||||
{
|
|
||||||
Name (_DSD, Package () {
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "node@0", NOD0 },
|
|
||||||
Package () { "node@1", NOD1 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Name (NOD0, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "random-property", 3 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Name (NOD1, Package() {
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "anothernode", ANOD },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Name (ANOD, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "random-property", 0 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
Device (DEV1)
|
|
||||||
{
|
|
||||||
Name (_DSD, Package () {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "reference", ^DEV0, "node@1", "anothernode" },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
Please also see a graph example in graph.txt .
|
|
||||||
|
|
||||||
References
|
|
||||||
----------
|
|
||||||
|
|
||||||
[1] Hierarchical Data Extension UUID For _DSD.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
|
|
||||||
referenced 2018-07-17.
|
|
||||||
|
|
||||||
[2] Device Properties UUID For _DSD.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
|
||||||
referenced 2016-10-04.
|
|
||||||
@@ -1,174 +0,0 @@
|
|||||||
Graphs
|
|
||||||
|
|
||||||
|
|
||||||
_DSD
|
|
||||||
----
|
|
||||||
|
|
||||||
_DSD (Device Specific Data) [7] is a predefined ACPI device
|
|
||||||
configuration object that can be used to convey information on
|
|
||||||
hardware features which are not specifically covered by the ACPI
|
|
||||||
specification [1][6]. There are two _DSD extensions that are relevant
|
|
||||||
for graphs: property [4] and hierarchical data extensions [5]. The
|
|
||||||
property extension provides generic key-value pairs whereas the
|
|
||||||
hierarchical data extension supports nodes with references to other
|
|
||||||
nodes, forming a tree. The nodes in the tree may contain properties as
|
|
||||||
defined by the property extension. The two extensions together provide
|
|
||||||
a tree-like structure with zero or more properties (key-value pairs)
|
|
||||||
in each node of the tree.
|
|
||||||
|
|
||||||
The data structure may be accessed at runtime by using the device_*
|
|
||||||
and fwnode_* functions defined in include/linux/fwnode.h .
|
|
||||||
|
|
||||||
Fwnode represents a generic firmware node object. It is independent of
|
|
||||||
the firmware type. In ACPI, fwnodes are _DSD hierarchical data
|
|
||||||
extensions objects. A device's _DSD object is represented by an
|
|
||||||
fwnode.
|
|
||||||
|
|
||||||
The data structure may be referenced to elsewhere in the ACPI tables
|
|
||||||
by using a hard reference to the device itself and an index to the
|
|
||||||
hierarchical data extension array on each depth.
|
|
||||||
|
|
||||||
|
|
||||||
Ports and endpoints
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
The port and endpoint concepts are very similar to those in Devicetree
|
|
||||||
[3]. A port represents an interface in a device, and an endpoint
|
|
||||||
represents a connection to that interface.
|
|
||||||
|
|
||||||
All port nodes are located under the device's "_DSD" node in the hierarchical
|
|
||||||
data extension tree. The data extension related to each port node must begin
|
|
||||||
with "port" and must be followed by the "@" character and the number of the port
|
|
||||||
as its key. The target object it refers to should be called "PRTX", where "X" is
|
|
||||||
the number of the port. An example of such a package would be:
|
|
||||||
|
|
||||||
Package() { "port@4", PRT4 }
|
|
||||||
|
|
||||||
Further on, endpoints are located under the port nodes. The hierarchical
|
|
||||||
data extension key of the endpoint nodes must begin with
|
|
||||||
"endpoint" and must be followed by the "@" character and the number of the
|
|
||||||
endpoint. The object it refers to should be called "EPXY", where "X" is the
|
|
||||||
number of the port and "Y" is the number of the endpoint. An example of such a
|
|
||||||
package would be:
|
|
||||||
|
|
||||||
Package() { "endpoint@0", EP40 }
|
|
||||||
|
|
||||||
Each port node contains a property extension key "port", the value of which is
|
|
||||||
the number of the port. Each endpoint is similarly numbered with a property
|
|
||||||
extension key "reg", the value of which is the number of the endpoint. Port
|
|
||||||
numbers must be unique within a device and endpoint numbers must be unique
|
|
||||||
within a port. If a device object may only have a single port, then the number
|
|
||||||
of that port shall be zero. Similarly, if a port may only have a single
|
|
||||||
endpoint, the number of that endpoint shall be zero.
|
|
||||||
|
|
||||||
The endpoint reference uses property extension with "remote-endpoint" property
|
|
||||||
name followed by a reference in the same package. Such references consist of the
|
|
||||||
remote device reference, the first package entry of the port data extension
|
|
||||||
reference under the device and finally the first package entry of the endpoint
|
|
||||||
data extension reference under the port. Individual references thus appear as:
|
|
||||||
|
|
||||||
Package() { device, "port@X", "endpoint@Y" }
|
|
||||||
|
|
||||||
In the above example, "X" is the number of the port and "Y" is the number of the
|
|
||||||
endpoint.
|
|
||||||
|
|
||||||
The references to endpoints must be always done both ways, to the
|
|
||||||
remote endpoint and back from the referred remote endpoint node.
|
|
||||||
|
|
||||||
A simple example of this is shown below:
|
|
||||||
|
|
||||||
Scope (\_SB.PCI0.I2C2)
|
|
||||||
{
|
|
||||||
Device (CAM0)
|
|
||||||
{
|
|
||||||
Name (_DSD, Package () {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "compatible", Package () { "nokia,smia" } },
|
|
||||||
},
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "port@0", PRT0 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Name (PRT0, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "reg", 0 },
|
|
||||||
},
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "endpoint@0", EP00 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Name (EP00, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "reg", 0 },
|
|
||||||
Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, "port@4", "endpoint@0" } },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Scope (\_SB.PCI0)
|
|
||||||
{
|
|
||||||
Device (ISP)
|
|
||||||
{
|
|
||||||
Name (_DSD, Package () {
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "port@4", PRT4 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
Name (PRT4, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "reg", 4 }, /* CSI-2 port number */
|
|
||||||
},
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () { "endpoint@0", EP40 },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
Name (EP40, Package() {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () { "reg", 0 },
|
|
||||||
Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, "port@0", "endpoint@0" } },
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Here, the port 0 of the "CAM0" device is connected to the port 4 of
|
|
||||||
the "ISP" device and vice versa.
|
|
||||||
|
|
||||||
|
|
||||||
References
|
|
||||||
----------
|
|
||||||
|
|
||||||
[1] _DSD (Device Specific Data) Implementation Guide.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm>,
|
|
||||||
referenced 2016-10-03.
|
|
||||||
|
|
||||||
[2] Devicetree. <URL:http://www.devicetree.org>, referenced 2016-10-03.
|
|
||||||
|
|
||||||
[3] Documentation/devicetree/bindings/graph.txt
|
|
||||||
|
|
||||||
[4] Device Properties UUID For _DSD.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
|
||||||
referenced 2016-10-04.
|
|
||||||
|
|
||||||
[5] Hierarchical Data Extension UUID For _DSD.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
|
|
||||||
referenced 2016-10-04.
|
|
||||||
|
|
||||||
[6] Advanced Configuration and Power Interface Specification.
|
|
||||||
<URL:http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf>,
|
|
||||||
referenced 2016-10-04.
|
|
||||||
|
|
||||||
[7] _DSD Device Properties Usage Rules.
|
|
||||||
Documentation/acpi/DSD-properties-rules.txt
|
|
||||||
99
Documentation/acpi/dsd/leds.txt
Normal file
99
Documentation/acpi/dsd/leds.txt
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
Describing and referring to LEDs in ACPI
|
||||||
|
|
||||||
|
Individual LEDs are described by hierarchical data extension [6] nodes under the
|
||||||
|
device node, the LED driver chip. The "reg" property in the LED specific nodes
|
||||||
|
tells the numerical ID of each individual LED output to which the LEDs are
|
||||||
|
connected. [3] The hierarchical data nodes are named "led@X", where X is the
|
||||||
|
number of the LED output.
|
||||||
|
|
||||||
|
Referring to LEDs in Device tree is documented in [4], in "flash-leds" property
|
||||||
|
documentation. In short, LEDs are directly referred to by using phandles.
|
||||||
|
|
||||||
|
While Device tree allows referring to any node in the tree[1], in ACPI
|
||||||
|
references are limited to device nodes only [2]. For this reason using the same
|
||||||
|
mechanism on ACPI is not possible. A mechanism to refer to non-device ACPI nodes
|
||||||
|
is documented in [7].
|
||||||
|
|
||||||
|
ACPI allows (as does DT) using integer arguments after the reference. A
|
||||||
|
combination of the LED driver device reference and an integer argument,
|
||||||
|
referring to the "reg" property of the relevant LED, is used to identify
|
||||||
|
individual LEDs. The value of the "reg" property is a contract between the
|
||||||
|
firmware and software, it uniquely identifies the LED driver outputs.
|
||||||
|
|
||||||
|
Under the LED driver device, the first hierarchical data extension package list
|
||||||
|
entry shall contain the string "led@" followed by the number of the LED,
|
||||||
|
followed by the referred object name. That object shall be named "LED" followed
|
||||||
|
by the number of the LED.
|
||||||
|
|
||||||
|
An ASL example of a camera sensor device and a LED driver device for two LEDs.
|
||||||
|
Objects not relevant for LEDs or the references to them have been omitted.
|
||||||
|
|
||||||
|
Device (LED)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
||||||
|
Package () {
|
||||||
|
Package () { "led@0", LED0 },
|
||||||
|
Package () { "led@1", LED1 },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (LED0, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "reg", 0 },
|
||||||
|
Package () { "flash-max-microamp", 1000000 },
|
||||||
|
Package () { "flash-timeout-us", 200000 },
|
||||||
|
Package () { "led-max-microamp", 100000 },
|
||||||
|
Package () { "label", "white:flash" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Name (LED1, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () { "reg", 1 },
|
||||||
|
Package () { "led-max-microamp", 10000 },
|
||||||
|
Package () { "label", "red:indicator" },
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
Device (SEN)
|
||||||
|
{
|
||||||
|
Name (_DSD, Package () {
|
||||||
|
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
||||||
|
Package () {
|
||||||
|
Package () {
|
||||||
|
"flash-leds",
|
||||||
|
Package () { ^LED, "led@0", ^LED, "led@1" },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
where
|
||||||
|
|
||||||
|
LED LED driver device
|
||||||
|
LED0 First LED
|
||||||
|
LED1 Second LED
|
||||||
|
SEN Camera sensor device (or another device the LED is
|
||||||
|
related to)
|
||||||
|
|
||||||
|
[1] Device tree. <URL:http://www.devicetree.org>, referenced 2019-02-21.
|
||||||
|
|
||||||
|
[2] Advanced Configuration and Power Interface Specification.
|
||||||
|
<URL:https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[3] Documentation/devicetree/bindings/leds/common.txt
|
||||||
|
|
||||||
|
[4] Documentation/devicetree/bindings/media/video-interfaces.txt
|
||||||
|
|
||||||
|
[5] Device Properties UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[6] Hierarchical Data Extension UUID For _DSD.
|
||||||
|
<URL:http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>,
|
||||||
|
referenced 2019-02-21.
|
||||||
|
|
||||||
|
[7] Documentation/acpi/dsd/data-node-reference.txt
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
Linux supports a method of overriding the BIOS DSDT:
|
|
||||||
|
|
||||||
CONFIG_ACPI_CUSTOM_DSDT builds the image into the kernel.
|
|
||||||
|
|
||||||
When to use this method is described in detail on the
|
|
||||||
Linux/ACPI home page:
|
|
||||||
https://01.org/linux-acpi/documentation/overriding-dsdt
|
|
||||||
@@ -1,426 +0,0 @@
|
|||||||
ACPI based device enumeration
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
ACPI 5 introduced a set of new resources (UartSerialBus, I2cSerialBus,
|
|
||||||
SpiSerialBus, GpioIo and GpioInt) which can be used in enumerating slave
|
|
||||||
devices behind serial bus controllers.
|
|
||||||
|
|
||||||
In addition we are starting to see peripherals integrated in the
|
|
||||||
SoC/Chipset to appear only in ACPI namespace. These are typically devices
|
|
||||||
that are accessed through memory-mapped registers.
|
|
||||||
|
|
||||||
In order to support this and re-use the existing drivers as much as
|
|
||||||
possible we decided to do the following:
|
|
||||||
|
|
||||||
o Devices that have no bus connector resource are represented as
|
|
||||||
platform devices.
|
|
||||||
|
|
||||||
o Devices behind real busses where there is a connector resource
|
|
||||||
are represented as struct spi_device or struct i2c_device
|
|
||||||
(standard UARTs are not busses so there is no struct uart_device).
|
|
||||||
|
|
||||||
As both ACPI and Device Tree represent a tree of devices (and their
|
|
||||||
resources) this implementation follows the Device Tree way as much as
|
|
||||||
possible.
|
|
||||||
|
|
||||||
The ACPI implementation enumerates devices behind busses (platform, SPI and
|
|
||||||
I2C), creates the physical devices and binds them to their ACPI handle in
|
|
||||||
the ACPI namespace.
|
|
||||||
|
|
||||||
This means that when ACPI_HANDLE(dev) returns non-NULL the device was
|
|
||||||
enumerated from ACPI namespace. This handle can be used to extract other
|
|
||||||
device-specific configuration. There is an example of this below.
|
|
||||||
|
|
||||||
Platform bus support
|
|
||||||
~~~~~~~~~~~~~~~~~~~~
|
|
||||||
Since we are using platform devices to represent devices that are not
|
|
||||||
connected to any physical bus we only need to implement a platform driver
|
|
||||||
for the device and add supported ACPI IDs. If this same IP-block is used on
|
|
||||||
some other non-ACPI platform, the driver might work out of the box or needs
|
|
||||||
some minor changes.
|
|
||||||
|
|
||||||
Adding ACPI support for an existing driver should be pretty
|
|
||||||
straightforward. Here is the simplest example:
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI
|
|
||||||
static const struct acpi_device_id mydrv_acpi_match[] = {
|
|
||||||
/* ACPI IDs here */
|
|
||||||
{ }
|
|
||||||
};
|
|
||||||
MODULE_DEVICE_TABLE(acpi, mydrv_acpi_match);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static struct platform_driver my_driver = {
|
|
||||||
...
|
|
||||||
.driver = {
|
|
||||||
.acpi_match_table = ACPI_PTR(mydrv_acpi_match),
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
If the driver needs to perform more complex initialization like getting and
|
|
||||||
configuring GPIOs it can get its ACPI handle and extract this information
|
|
||||||
from ACPI tables.
|
|
||||||
|
|
||||||
DMA support
|
|
||||||
~~~~~~~~~~~
|
|
||||||
DMA controllers enumerated via ACPI should be registered in the system to
|
|
||||||
provide generic access to their resources. For example, a driver that would
|
|
||||||
like to be accessible to slave devices via generic API call
|
|
||||||
dma_request_slave_channel() must register itself at the end of the probe
|
|
||||||
function like this:
|
|
||||||
|
|
||||||
err = devm_acpi_dma_controller_register(dev, xlate_func, dw);
|
|
||||||
/* Handle the error if it's not a case of !CONFIG_ACPI */
|
|
||||||
|
|
||||||
and implement custom xlate function if needed (usually acpi_dma_simple_xlate()
|
|
||||||
is enough) which converts the FixedDMA resource provided by struct
|
|
||||||
acpi_dma_spec into the corresponding DMA channel. A piece of code for that case
|
|
||||||
could look like:
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI
|
|
||||||
struct filter_args {
|
|
||||||
/* Provide necessary information for the filter_func */
|
|
||||||
...
|
|
||||||
};
|
|
||||||
|
|
||||||
static bool filter_func(struct dma_chan *chan, void *param)
|
|
||||||
{
|
|
||||||
/* Choose the proper channel */
|
|
||||||
...
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct dma_chan *xlate_func(struct acpi_dma_spec *dma_spec,
|
|
||||||
struct acpi_dma *adma)
|
|
||||||
{
|
|
||||||
dma_cap_mask_t cap;
|
|
||||||
struct filter_args args;
|
|
||||||
|
|
||||||
/* Prepare arguments for filter_func */
|
|
||||||
...
|
|
||||||
return dma_request_channel(cap, filter_func, &args);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static struct dma_chan *xlate_func(struct acpi_dma_spec *dma_spec,
|
|
||||||
struct acpi_dma *adma)
|
|
||||||
{
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
dma_request_slave_channel() will call xlate_func() for each registered DMA
|
|
||||||
controller. In the xlate function the proper channel must be chosen based on
|
|
||||||
information in struct acpi_dma_spec and the properties of the controller
|
|
||||||
provided by struct acpi_dma.
|
|
||||||
|
|
||||||
Clients must call dma_request_slave_channel() with the string parameter that
|
|
||||||
corresponds to a specific FixedDMA resource. By default "tx" means the first
|
|
||||||
entry of the FixedDMA resource array, "rx" means the second entry. The table
|
|
||||||
below shows a layout:
|
|
||||||
|
|
||||||
Device (I2C0)
|
|
||||||
{
|
|
||||||
...
|
|
||||||
Method (_CRS, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
Name (DBUF, ResourceTemplate ()
|
|
||||||
{
|
|
||||||
FixedDMA (0x0018, 0x0004, Width32bit, _Y48)
|
|
||||||
FixedDMA (0x0019, 0x0005, Width32bit, )
|
|
||||||
})
|
|
||||||
...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
So, the FixedDMA with request line 0x0018 is "tx" and next one is "rx" in
|
|
||||||
this example.
|
|
||||||
|
|
||||||
In robust cases the client unfortunately needs to call
|
|
||||||
acpi_dma_request_slave_chan_by_index() directly and therefore choose the
|
|
||||||
specific FixedDMA resource by its index.
|
|
||||||
|
|
||||||
SPI serial bus support
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
Slave devices behind SPI bus have SpiSerialBus resource attached to them.
|
|
||||||
This is extracted automatically by the SPI core and the slave devices are
|
|
||||||
enumerated once spi_register_master() is called by the bus driver.
|
|
||||||
|
|
||||||
Here is what the ACPI namespace for a SPI slave might look like:
|
|
||||||
|
|
||||||
Device (EEP0)
|
|
||||||
{
|
|
||||||
Name (_ADR, 1)
|
|
||||||
Name (_CID, Package() {
|
|
||||||
"ATML0025",
|
|
||||||
"AT25",
|
|
||||||
})
|
|
||||||
...
|
|
||||||
Method (_CRS, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
SPISerialBus(1, PolarityLow, FourWireMode, 8,
|
|
||||||
ControllerInitiated, 1000000, ClockPolarityLow,
|
|
||||||
ClockPhaseFirst, "\\_SB.PCI0.SPI1",)
|
|
||||||
}
|
|
||||||
...
|
|
||||||
|
|
||||||
The SPI device drivers only need to add ACPI IDs in a similar way as with
|
|
||||||
the platform device drivers. Below is an example where we add ACPI support
|
|
||||||
to at25 SPI eeprom driver (this is meant for the above ACPI snippet):
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI
|
|
||||||
static const struct acpi_device_id at25_acpi_match[] = {
|
|
||||||
{ "AT25", 0 },
|
|
||||||
{ },
|
|
||||||
};
|
|
||||||
MODULE_DEVICE_TABLE(acpi, at25_acpi_match);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static struct spi_driver at25_driver = {
|
|
||||||
.driver = {
|
|
||||||
...
|
|
||||||
.acpi_match_table = ACPI_PTR(at25_acpi_match),
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
Note that this driver actually needs more information like page size of the
|
|
||||||
eeprom etc. but at the time of writing this there is no standard way of
|
|
||||||
passing those. One idea is to return this in _DSM method like:
|
|
||||||
|
|
||||||
Device (EEP0)
|
|
||||||
{
|
|
||||||
...
|
|
||||||
Method (_DSM, 4, NotSerialized)
|
|
||||||
{
|
|
||||||
Store (Package (6)
|
|
||||||
{
|
|
||||||
"byte-len", 1024,
|
|
||||||
"addr-mode", 2,
|
|
||||||
"page-size", 32
|
|
||||||
}, Local0)
|
|
||||||
|
|
||||||
// Check UUIDs etc.
|
|
||||||
|
|
||||||
Return (Local0)
|
|
||||||
}
|
|
||||||
|
|
||||||
Then the at25 SPI driver can get this configuration by calling _DSM on its
|
|
||||||
ACPI handle like:
|
|
||||||
|
|
||||||
struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL };
|
|
||||||
struct acpi_object_list input;
|
|
||||||
acpi_status status;
|
|
||||||
|
|
||||||
/* Fill in the input buffer */
|
|
||||||
|
|
||||||
status = acpi_evaluate_object(ACPI_HANDLE(&spi->dev), "_DSM",
|
|
||||||
&input, &output);
|
|
||||||
if (ACPI_FAILURE(status))
|
|
||||||
/* Handle the error */
|
|
||||||
|
|
||||||
/* Extract the data here */
|
|
||||||
|
|
||||||
kfree(output.pointer);
|
|
||||||
|
|
||||||
I2C serial bus support
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
The slaves behind I2C bus controller only need to add the ACPI IDs like
|
|
||||||
with the platform and SPI drivers. The I2C core automatically enumerates
|
|
||||||
any slave devices behind the controller device once the adapter is
|
|
||||||
registered.
|
|
||||||
|
|
||||||
Below is an example of how to add ACPI support to the existing mpu3050
|
|
||||||
input driver:
|
|
||||||
|
|
||||||
#ifdef CONFIG_ACPI
|
|
||||||
static const struct acpi_device_id mpu3050_acpi_match[] = {
|
|
||||||
{ "MPU3050", 0 },
|
|
||||||
{ },
|
|
||||||
};
|
|
||||||
MODULE_DEVICE_TABLE(acpi, mpu3050_acpi_match);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static struct i2c_driver mpu3050_i2c_driver = {
|
|
||||||
.driver = {
|
|
||||||
.name = "mpu3050",
|
|
||||||
.owner = THIS_MODULE,
|
|
||||||
.pm = &mpu3050_pm,
|
|
||||||
.of_match_table = mpu3050_of_match,
|
|
||||||
.acpi_match_table = ACPI_PTR(mpu3050_acpi_match),
|
|
||||||
},
|
|
||||||
.probe = mpu3050_probe,
|
|
||||||
.remove = mpu3050_remove,
|
|
||||||
.id_table = mpu3050_ids,
|
|
||||||
};
|
|
||||||
|
|
||||||
GPIO support
|
|
||||||
~~~~~~~~~~~~
|
|
||||||
ACPI 5 introduced two new resources to describe GPIO connections: GpioIo
|
|
||||||
and GpioInt. These resources can be used to pass GPIO numbers used by
|
|
||||||
the device to the driver. ACPI 5.1 extended this with _DSD (Device
|
|
||||||
Specific Data) which made it possible to name the GPIOs among other things.
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
Device (DEV)
|
|
||||||
{
|
|
||||||
Method (_CRS, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
Name (SBUF, ResourceTemplate()
|
|
||||||
{
|
|
||||||
...
|
|
||||||
// Used to power on/off the device
|
|
||||||
GpioIo (Exclusive, PullDefault, 0x0000, 0x0000,
|
|
||||||
IoRestrictionOutputOnly, "\\_SB.PCI0.GPI0",
|
|
||||||
0x00, ResourceConsumer,,)
|
|
||||||
{
|
|
||||||
// Pin List
|
|
||||||
0x0055
|
|
||||||
}
|
|
||||||
|
|
||||||
// Interrupt for the device
|
|
||||||
GpioInt (Edge, ActiveHigh, ExclusiveAndWake, PullNone,
|
|
||||||
0x0000, "\\_SB.PCI0.GPI0", 0x00, ResourceConsumer,,)
|
|
||||||
{
|
|
||||||
// Pin list
|
|
||||||
0x0058
|
|
||||||
}
|
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
Return (SBUF)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ACPI 5.1 _DSD used for naming the GPIOs
|
|
||||||
Name (_DSD, Package ()
|
|
||||||
{
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package ()
|
|
||||||
{
|
|
||||||
Package () {"power-gpios", Package() {^DEV, 0, 0, 0 }},
|
|
||||||
Package () {"irq-gpios", Package() {^DEV, 1, 0, 0 }},
|
|
||||||
}
|
|
||||||
})
|
|
||||||
...
|
|
||||||
|
|
||||||
These GPIO numbers are controller relative and path "\\_SB.PCI0.GPI0"
|
|
||||||
specifies the path to the controller. In order to use these GPIOs in Linux
|
|
||||||
we need to translate them to the corresponding Linux GPIO descriptors.
|
|
||||||
|
|
||||||
There is a standard GPIO API for that and it is documented in
|
|
||||||
Documentation/gpio/.
|
|
||||||
|
|
||||||
In the above example we can get the corresponding two GPIO descriptors with
|
|
||||||
a code like this:
|
|
||||||
|
|
||||||
#include <linux/gpio/consumer.h>
|
|
||||||
...
|
|
||||||
|
|
||||||
struct gpio_desc *irq_desc, *power_desc;
|
|
||||||
|
|
||||||
irq_desc = gpiod_get(dev, "irq");
|
|
||||||
if (IS_ERR(irq_desc))
|
|
||||||
/* handle error */
|
|
||||||
|
|
||||||
power_desc = gpiod_get(dev, "power");
|
|
||||||
if (IS_ERR(power_desc))
|
|
||||||
/* handle error */
|
|
||||||
|
|
||||||
/* Now we can use the GPIO descriptors */
|
|
||||||
|
|
||||||
There are also devm_* versions of these functions which release the
|
|
||||||
descriptors once the device is released.
|
|
||||||
|
|
||||||
See Documentation/acpi/gpio-properties.txt for more information about the
|
|
||||||
_DSD binding related to GPIOs.
|
|
||||||
|
|
||||||
MFD devices
|
|
||||||
~~~~~~~~~~~
|
|
||||||
The MFD devices register their children as platform devices. For the child
|
|
||||||
devices there needs to be an ACPI handle that they can use to reference
|
|
||||||
parts of the ACPI namespace that relate to them. In the Linux MFD subsystem
|
|
||||||
we provide two ways:
|
|
||||||
|
|
||||||
o The children share the parent ACPI handle.
|
|
||||||
o The MFD cell can specify the ACPI id of the device.
|
|
||||||
|
|
||||||
For the first case, the MFD drivers do not need to do anything. The
|
|
||||||
resulting child platform device will have its ACPI_COMPANION() set to point
|
|
||||||
to the parent device.
|
|
||||||
|
|
||||||
If the ACPI namespace has a device that we can match using an ACPI id or ACPI
|
|
||||||
adr, the cell should be set like:
|
|
||||||
|
|
||||||
static struct mfd_cell_acpi_match my_subdevice_cell_acpi_match = {
|
|
||||||
.pnpid = "XYZ0001",
|
|
||||||
.adr = 0,
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct mfd_cell my_subdevice_cell = {
|
|
||||||
.name = "my_subdevice",
|
|
||||||
/* set the resources relative to the parent */
|
|
||||||
.acpi_match = &my_subdevice_cell_acpi_match,
|
|
||||||
};
|
|
||||||
|
|
||||||
The ACPI id "XYZ0001" is then used to lookup an ACPI device directly under
|
|
||||||
the MFD device and if found, that ACPI companion device is bound to the
|
|
||||||
resulting child platform device.
|
|
||||||
|
|
||||||
Device Tree namespace link device ID
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
The Device Tree protocol uses device identification based on the "compatible"
|
|
||||||
property whose value is a string or an array of strings recognized as device
|
|
||||||
identifiers by drivers and the driver core. The set of all those strings may be
|
|
||||||
regarded as a device identification namespace analogous to the ACPI/PNP device
|
|
||||||
ID namespace. Consequently, in principle it should not be necessary to allocate
|
|
||||||
a new (and arguably redundant) ACPI/PNP device ID for a device with an existing
|
|
||||||
identification string in the Device Tree (DT) namespace, especially if that ID
|
|
||||||
is only needed to indicate that a given device is compatible with another one,
|
|
||||||
presumably having a matching driver in the kernel already.
|
|
||||||
|
|
||||||
In ACPI, the device identification object called _CID (Compatible ID) is used to
|
|
||||||
list the IDs of devices the given one is compatible with, but those IDs must
|
|
||||||
belong to one of the namespaces prescribed by the ACPI specification (see
|
|
||||||
Section 6.1.2 of ACPI 6.0 for details) and the DT namespace is not one of them.
|
|
||||||
Moreover, the specification mandates that either a _HID or an _ADR identification
|
|
||||||
object be present for all ACPI objects representing devices (Section 6.1 of ACPI
|
|
||||||
6.0). For non-enumerable bus types that object must be _HID and its value must
|
|
||||||
be a device ID from one of the namespaces prescribed by the specification too.
|
|
||||||
|
|
||||||
The special DT namespace link device ID, PRP0001, provides a means to use the
|
|
||||||
existing DT-compatible device identification in ACPI and to satisfy the above
|
|
||||||
requirements following from the ACPI specification at the same time. Namely,
|
|
||||||
if PRP0001 is returned by _HID, the ACPI subsystem will look for the
|
|
||||||
"compatible" property in the device object's _DSD and will use the value of that
|
|
||||||
property to identify the corresponding device in analogy with the original DT
|
|
||||||
device identification algorithm. If the "compatible" property is not present
|
|
||||||
or its value is not valid, the device will not be enumerated by the ACPI
|
|
||||||
subsystem. Otherwise, it will be enumerated automatically as a platform device
|
|
||||||
(except when an I2C or SPI link from the device to its parent is present, in
|
|
||||||
which case the ACPI core will leave the device enumeration to the parent's
|
|
||||||
driver) and the identification strings from the "compatible" property value will
|
|
||||||
be used to find a driver for the device along with the device IDs listed by _CID
|
|
||||||
(if present).
|
|
||||||
|
|
||||||
Analogously, if PRP0001 is present in the list of device IDs returned by _CID,
|
|
||||||
the identification strings listed by the "compatible" property value (if present
|
|
||||||
and valid) will be used to look for a driver matching the device, but in that
|
|
||||||
case their relative priority with respect to the other device IDs listed by
|
|
||||||
_HID and _CID depends on the position of PRP0001 in the _CID return package.
|
|
||||||
Specifically, the device IDs returned by _HID and preceding PRP0001 in the _CID
|
|
||||||
return package will be checked first. Also in that case the bus type the device
|
|
||||||
will be enumerated to depends on the device ID returned by _HID.
|
|
||||||
|
|
||||||
It is valid to define device objects with a _HID returning PRP0001 and without
|
|
||||||
the "compatible" property in the _DSD or a _CID as long as one of their
|
|
||||||
ancestors provides a _DSD with a valid "compatible" property. Such device
|
|
||||||
objects are then simply regarded as additional "blocks" providing hierarchical
|
|
||||||
configuration information to the driver of the composite ancestor device.
|
|
||||||
|
|
||||||
However, PRP0001 can only be returned from either _HID or _CID of a device
|
|
||||||
object if all of the properties returned by the _DSD associated with it (either
|
|
||||||
the _DSD of the device object itself or the _DSD of its ancestor in the
|
|
||||||
"composite device" case described above) can be used in the ACPI environment.
|
|
||||||
Otherwise, the _DSD itself is regarded as invalid and therefore the "compatible"
|
|
||||||
property returned by it is meaningless.
|
|
||||||
|
|
||||||
Refer to DSD-properties-rules.txt for more information.
|
|
||||||
@@ -1,223 +0,0 @@
|
|||||||
_DSD Device Properties Related to GPIO
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
With the release of ACPI 5.1, the _DSD configuration object finally
|
|
||||||
allows names to be given to GPIOs (and other things as well) returned
|
|
||||||
by _CRS. Previously, we were only able to use an integer index to find
|
|
||||||
the corresponding GPIO, which is pretty error prone (it depends on
|
|
||||||
the _CRS output ordering, for example).
|
|
||||||
|
|
||||||
With _DSD we can now query GPIOs using a name instead of an integer
|
|
||||||
index, like the ASL example below shows:
|
|
||||||
|
|
||||||
// Bluetooth device with reset and shutdown GPIOs
|
|
||||||
Device (BTH)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
|
|
||||||
Name (_CRS, ResourceTemplate ()
|
|
||||||
{
|
|
||||||
GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionInputOnly,
|
|
||||||
"\\_SB.GPO0", 0, ResourceConsumer) {15}
|
|
||||||
GpioIo (Exclusive, PullUp, 0, 0, IoRestrictionInputOnly,
|
|
||||||
"\\_SB.GPO0", 0, ResourceConsumer) {27, 31}
|
|
||||||
})
|
|
||||||
|
|
||||||
Name (_DSD, Package ()
|
|
||||||
{
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package ()
|
|
||||||
{
|
|
||||||
Package () {"reset-gpios", Package() {^BTH, 1, 1, 0 }},
|
|
||||||
Package () {"shutdown-gpios", Package() {^BTH, 0, 0, 0 }},
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
The format of the supported GPIO property is:
|
|
||||||
|
|
||||||
Package () { "name", Package () { ref, index, pin, active_low }}
|
|
||||||
|
|
||||||
ref - The device that has _CRS containing GpioIo()/GpioInt() resources,
|
|
||||||
typically this is the device itself (BTH in our case).
|
|
||||||
index - Index of the GpioIo()/GpioInt() resource in _CRS starting from zero.
|
|
||||||
pin - Pin in the GpioIo()/GpioInt() resource. Typically this is zero.
|
|
||||||
active_low - If 1 the GPIO is marked as active_low.
|
|
||||||
|
|
||||||
Since ACPI GpioIo() resource does not have a field saying whether it is
|
|
||||||
active low or high, the "active_low" argument can be used here. Setting
|
|
||||||
it to 1 marks the GPIO as active low.
|
|
||||||
|
|
||||||
In our Bluetooth example the "reset-gpios" refers to the second GpioIo()
|
|
||||||
resource, second pin in that resource with the GPIO number of 31.
|
|
||||||
|
|
||||||
It is possible to leave holes in the array of GPIOs. This is useful in
|
|
||||||
cases like with SPI host controllers where some chip selects may be
|
|
||||||
implemented as GPIOs and some as native signals. For example a SPI host
|
|
||||||
controller can have chip selects 0 and 2 implemented as GPIOs and 1 as
|
|
||||||
native:
|
|
||||||
|
|
||||||
Package () {
|
|
||||||
"cs-gpios",
|
|
||||||
Package () {
|
|
||||||
^GPIO, 19, 0, 0, // chip select 0: GPIO
|
|
||||||
0, // chip select 1: native signal
|
|
||||||
^GPIO, 20, 0, 0, // chip select 2: GPIO
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Other supported properties
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
The following Device Tree compatible device properties are also supported by
|
|
||||||
_DSD device properties for GPIO controllers:
|
|
||||||
|
|
||||||
- gpio-hog
|
|
||||||
- output-high
|
|
||||||
- output-low
|
|
||||||
- input
|
|
||||||
- line-name
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
Name (_DSD, Package () {
|
|
||||||
// _DSD Hierarchical Properties Extension UUID
|
|
||||||
ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"),
|
|
||||||
Package () {
|
|
||||||
Package () {"hog-gpio8", "G8PU"}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
Name (G8PU, Package () {
|
|
||||||
ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
|
|
||||||
Package () {
|
|
||||||
Package () {"gpio-hog", 1},
|
|
||||||
Package () {"gpios", Package () {8, 0}},
|
|
||||||
Package () {"output-high", 1},
|
|
||||||
Package () {"line-name", "gpio8-pullup"},
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
- gpio-line-names
|
|
||||||
|
|
||||||
Example:
|
|
||||||
|
|
||||||
Package () {
|
|
||||||
"gpio-line-names",
|
|
||||||
Package () {
|
|
||||||
"SPI0_CS_N", "EXP2_INT", "MUX6_IO", "UART0_RXD", "MUX7_IO",
|
|
||||||
"LVL_C_A1", "MUX0_IO", "SPI1_MISO"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
See Documentation/devicetree/bindings/gpio/gpio.txt for more information
|
|
||||||
about these properties.
|
|
||||||
|
|
||||||
ACPI GPIO Mappings Provided by Drivers
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
There are systems in which the ACPI tables do not contain _DSD but provide _CRS
|
|
||||||
with GpioIo()/GpioInt() resources and device drivers still need to work with
|
|
||||||
them.
|
|
||||||
|
|
||||||
In those cases ACPI device identification objects, _HID, _CID, _CLS, _SUB, _HRV,
|
|
||||||
available to the driver can be used to identify the device and that is supposed
|
|
||||||
to be sufficient to determine the meaning and purpose of all of the GPIO lines
|
|
||||||
listed by the GpioIo()/GpioInt() resources returned by _CRS. In other words,
|
|
||||||
the driver is supposed to know what to use the GpioIo()/GpioInt() resources for
|
|
||||||
once it has identified the device. Having done that, it can simply assign names
|
|
||||||
to the GPIO lines it is going to use and provide the GPIO subsystem with a
|
|
||||||
mapping between those names and the ACPI GPIO resources corresponding to them.
|
|
||||||
|
|
||||||
To do that, the driver needs to define a mapping table as a NULL-terminated
|
|
||||||
array of struct acpi_gpio_mapping objects that each contain a name, a pointer
|
|
||||||
to an array of line data (struct acpi_gpio_params) objects and the size of that
|
|
||||||
array. Each struct acpi_gpio_params object consists of three fields,
|
|
||||||
crs_entry_index, line_index, active_low, representing the index of the target
|
|
||||||
GpioIo()/GpioInt() resource in _CRS starting from zero, the index of the target
|
|
||||||
line in that resource starting from zero, and the active-low flag for that line,
|
|
||||||
respectively, in analogy with the _DSD GPIO property format specified above.
|
|
||||||
|
|
||||||
For the example Bluetooth device discussed previously the data structures in
|
|
||||||
question would look like this:
|
|
||||||
|
|
||||||
static const struct acpi_gpio_params reset_gpio = { 1, 1, false };
|
|
||||||
static const struct acpi_gpio_params shutdown_gpio = { 0, 0, false };
|
|
||||||
|
|
||||||
static const struct acpi_gpio_mapping bluetooth_acpi_gpios[] = {
|
|
||||||
{ "reset-gpios", &reset_gpio, 1 },
|
|
||||||
{ "shutdown-gpios", &shutdown_gpio, 1 },
|
|
||||||
{ },
|
|
||||||
};
|
|
||||||
|
|
||||||
Next, the mapping table needs to be passed as the second argument to
|
|
||||||
acpi_dev_add_driver_gpios() that will register it with the ACPI device object
|
|
||||||
pointed to by its first argument. That should be done in the driver's .probe()
|
|
||||||
routine. On removal, the driver should unregister its GPIO mapping table by
|
|
||||||
calling acpi_dev_remove_driver_gpios() on the ACPI device object where that
|
|
||||||
table was previously registered.
|
|
||||||
|
|
||||||
Using the _CRS fallback
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
If a device does not have _DSD or the driver does not create ACPI GPIO
|
|
||||||
mapping, the Linux GPIO framework refuses to return any GPIOs. This is
|
|
||||||
because the driver does not know what it actually gets. For example if we
|
|
||||||
have a device like below:
|
|
||||||
|
|
||||||
Device (BTH)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
|
|
||||||
"\\_SB.GPO0", 0, ResourceConsumer) {15}
|
|
||||||
GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
|
|
||||||
"\\_SB.GPO0", 0, ResourceConsumer) {27}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
The driver might expect to get the right GPIO when it does:
|
|
||||||
|
|
||||||
desc = gpiod_get(dev, "reset", GPIOD_OUT_LOW);
|
|
||||||
|
|
||||||
but since there is no way to know the mapping between "reset" and
|
|
||||||
the GpioIo() in _CRS desc will hold ERR_PTR(-ENOENT).
|
|
||||||
|
|
||||||
The driver author can solve this by passing the mapping explicitly
|
|
||||||
(the recommended way and documented in the above chapter).
|
|
||||||
|
|
||||||
The ACPI GPIO mapping tables should not contaminate drivers that are not
|
|
||||||
knowing about which exact device they are servicing on. It implies that
|
|
||||||
the ACPI GPIO mapping tables are tightly linked to ACPI ID and certain
|
|
||||||
objects, as listed in the above chapter, of the device in question.
|
|
||||||
|
|
||||||
Getting GPIO descriptor
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
There are two main approaches to get GPIO resource from ACPI:
|
|
||||||
desc = gpiod_get(dev, connection_id, flags);
|
|
||||||
desc = gpiod_get_index(dev, connection_id, index, flags);
|
|
||||||
|
|
||||||
We may consider two different cases here, i.e. when connection ID is
|
|
||||||
provided and otherwise.
|
|
||||||
|
|
||||||
Case 1:
|
|
||||||
desc = gpiod_get(dev, "non-null-connection-id", flags);
|
|
||||||
desc = gpiod_get_index(dev, "non-null-connection-id", index, flags);
|
|
||||||
|
|
||||||
Case 2:
|
|
||||||
desc = gpiod_get(dev, NULL, flags);
|
|
||||||
desc = gpiod_get_index(dev, NULL, index, flags);
|
|
||||||
|
|
||||||
Case 1 assumes that corresponding ACPI device description must have
|
|
||||||
defined device properties and will prevent getting any GPIO resources
|
|
||||||
otherwise.
|
|
||||||
|
|
||||||
Case 2 explicitly tells GPIO core to look for resources in _CRS.
|
|
||||||
|
|
||||||
Be aware that gpiod_get_index() in cases 1 and 2, assuming that there
|
|
||||||
are two versions of ACPI device description provided and no mapping is
|
|
||||||
present in the driver, will return different resources. That's why a
|
|
||||||
certain driver has to handle them carefully as explained in previous
|
|
||||||
chapter.
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
ACPI I2C Muxes
|
|
||||||
--------------
|
|
||||||
|
|
||||||
Describing an I2C device hierarchy that includes I2C muxes requires an ACPI
|
|
||||||
Device () scope per mux channel.
|
|
||||||
|
|
||||||
Consider this topology:
|
|
||||||
|
|
||||||
+------+ +------+
|
|
||||||
| SMB1 |-->| MUX0 |--CH00--> i2c client A (0x50)
|
|
||||||
| | | 0x70 |--CH01--> i2c client B (0x50)
|
|
||||||
+------+ +------+
|
|
||||||
|
|
||||||
which corresponds to the following ASL:
|
|
||||||
|
|
||||||
Device (SMB1)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Device (MUX0)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x70, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^SMB1", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
})
|
|
||||||
|
|
||||||
Device (CH00)
|
|
||||||
{
|
|
||||||
Name (_ADR, 0)
|
|
||||||
|
|
||||||
Device (CLIA)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^CH00", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Device (CH01)
|
|
||||||
{
|
|
||||||
Name (_ADR, 1)
|
|
||||||
|
|
||||||
Device (CLIB)
|
|
||||||
{
|
|
||||||
Name (_HID, ...)
|
|
||||||
Name (_CRS, ResourceTemplate () {
|
|
||||||
I2cSerialBus (0x50, ControllerInitiated, I2C_SPEED,
|
|
||||||
AddressingMode7Bit, "^CH01", 0x00,
|
|
||||||
ResourceConsumer,,)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,111 +0,0 @@
|
|||||||
Upgrading ACPI tables via initrd
|
|
||||||
================================
|
|
||||||
|
|
||||||
1) Introduction (What is this about)
|
|
||||||
2) What is this for
|
|
||||||
3) How does it work
|
|
||||||
4) References (Where to retrieve userspace tools)
|
|
||||||
|
|
||||||
1) What is this about
|
|
||||||
---------------------
|
|
||||||
|
|
||||||
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
|
||||||
upgrade the ACPI execution environment that is defined by the ACPI tables
|
|
||||||
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
|
||||||
modified, more recent version, or installing brand new ACPI tables.
|
|
||||||
|
|
||||||
When building initrd with kernel in a single image, option
|
|
||||||
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
|
||||||
feature to work.
|
|
||||||
|
|
||||||
For a full list of ACPI tables that can be upgraded/installed, take a look
|
|
||||||
at the char *table_sigs[MAX_ACPI_SIGNATURE]; definition in
|
|
||||||
drivers/acpi/tables.c.
|
|
||||||
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
|
||||||
be overridable, except:
|
|
||||||
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
|
||||||
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
|
||||||
Both could get implemented as well.
|
|
||||||
|
|
||||||
|
|
||||||
2) What is this for
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
|
||||||
that a workaround is not accepted in the Linux kernel. And this facility
|
|
||||||
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
|
||||||
releases an upgraded BIOS binary.
|
|
||||||
|
|
||||||
This facility can be used by platform/BIOS vendors to provide a Linux
|
|
||||||
compatible environment without modifying the underlying platform firmware.
|
|
||||||
|
|
||||||
This facility also provides a powerful feature to easily debug and test
|
|
||||||
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
|
||||||
platform provided ACPI tables or inserting new ACPI tables.
|
|
||||||
|
|
||||||
It can and should be enabled in any kernel because there is no functional
|
|
||||||
change with non-instrumented initrds.
|
|
||||||
|
|
||||||
|
|
||||||
3) How does it work
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
# Extract the machine's ACPI tables:
|
|
||||||
cd /tmp
|
|
||||||
acpidump >acpidump
|
|
||||||
acpixtract -a acpidump
|
|
||||||
# Disassemble, modify and recompile them:
|
|
||||||
iasl -d *.dat
|
|
||||||
# For example add this statement into a _PRT (PCI Routing Table) function
|
|
||||||
# of the DSDT:
|
|
||||||
Store("HELLO WORLD", debug)
|
|
||||||
# And increase the OEM Revision. For example, before modification:
|
|
||||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
|
||||||
# After modification:
|
|
||||||
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
|
||||||
iasl -sa dsdt.dsl
|
|
||||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
|
||||||
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
|
||||||
# archive. Note that if the table put here matches a platform table
|
|
||||||
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
|
||||||
# with a more recent OEM Revision, the platform table will be upgraded by
|
|
||||||
# this table. If the table put here doesn't match a platform table
|
|
||||||
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
|
||||||
# ID), this table will be appended.
|
|
||||||
mkdir -p kernel/firmware/acpi
|
|
||||||
cp dsdt.aml kernel/firmware/acpi
|
|
||||||
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
|
||||||
# (see osl.c):
|
|
||||||
iasl -sa facp.dsl
|
|
||||||
iasl -sa ssdt1.dsl
|
|
||||||
cp facp.aml kernel/firmware/acpi
|
|
||||||
cp ssdt1.aml kernel/firmware/acpi
|
|
||||||
# The uncompressed cpio archive must be the first. Other, typically
|
|
||||||
# compressed cpio archives, must be concatenated on top of the uncompressed
|
|
||||||
# one. Following command creates the uncompressed cpio archive and
|
|
||||||
# concatenates the original initrd on top:
|
|
||||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
|
||||||
cat /boot/initrd >>/boot/instrumented_initrd
|
|
||||||
# reboot with increased acpi debug level, e.g. boot params:
|
|
||||||
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
|
||||||
# and check your syslog:
|
|
||||||
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
|
||||||
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
|
||||||
|
|
||||||
iasl is able to disassemble and recompile quite a lot of different,
|
|
||||||
also static ACPI tables.
|
|
||||||
|
|
||||||
|
|
||||||
4) Where to retrieve userspace tools
|
|
||||||
------------------------------------
|
|
||||||
|
|
||||||
iasl and acpixtract are part of Intel's ACPICA project:
|
|
||||||
http://acpica.org/
|
|
||||||
and should be packaged by distributions (for example in the acpica package
|
|
||||||
on SUSE).
|
|
||||||
|
|
||||||
acpidump can be found in Len Brown's pmtools:
|
|
||||||
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
|
||||||
This tool is also part of the acpica package on SUSE.
|
|
||||||
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
|
||||||
/sys/firmware/acpi/tables
|
|
||||||
@@ -1,262 +0,0 @@
|
|||||||
Linuxized ACPICA - Introduction to ACPICA Release Automation
|
|
||||||
|
|
||||||
Copyright (C) 2013-2016, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
|
|
||||||
This document describes the ACPICA project and the relationship between
|
|
||||||
ACPICA and Linux. It also describes how ACPICA code in drivers/acpi/acpica,
|
|
||||||
include/acpi and tools/power/acpi is automatically updated to follow the
|
|
||||||
upstream.
|
|
||||||
|
|
||||||
|
|
||||||
1. ACPICA Project
|
|
||||||
|
|
||||||
The ACPI Component Architecture (ACPICA) project provides an operating
|
|
||||||
system (OS)-independent reference implementation of the Advanced
|
|
||||||
Configuration and Power Interface Specification (ACPI). It has been
|
|
||||||
adapted by various host OSes. By directly integrating ACPICA, Linux can
|
|
||||||
also benefit from the application experiences of ACPICA from other host
|
|
||||||
OSes.
|
|
||||||
|
|
||||||
The homepage of ACPICA project is: www.acpica.org, it is maintained and
|
|
||||||
supported by Intel Corporation.
|
|
||||||
|
|
||||||
The following figure depicts the Linux ACPI subsystem where the ACPICA
|
|
||||||
adaptation is included:
|
|
||||||
|
|
||||||
+---------------------------------------------------------+
|
|
||||||
| |
|
|
||||||
| +---------------------------------------------------+ |
|
|
||||||
| | +------------------+ | |
|
|
||||||
| | | Table Management | | |
|
|
||||||
| | +------------------+ | |
|
|
||||||
| | +----------------------+ | |
|
|
||||||
| | | Namespace Management | | |
|
|
||||||
| | +----------------------+ | |
|
|
||||||
| | +------------------+ ACPICA Components | |
|
|
||||||
| | | Event Management | | |
|
|
||||||
| | +------------------+ | |
|
|
||||||
| | +---------------------+ | |
|
|
||||||
| | | Resource Management | | |
|
|
||||||
| | +---------------------+ | |
|
|
||||||
| | +---------------------+ | |
|
|
||||||
| | | Hardware Management | | |
|
|
||||||
| | +---------------------+ | |
|
|
||||||
| +---------------------------------------------------+ | |
|
|
||||||
| | | +------------------+ | | |
|
|
||||||
| | | | OS Service Layer | | | |
|
|
||||||
| | | +------------------+ | | |
|
|
||||||
| | +-------------------------------------------------|-+ |
|
|
||||||
| | +--------------------+ | |
|
|
||||||
| | | Device Enumeration | | |
|
|
||||||
| | +--------------------+ | |
|
|
||||||
| | +------------------+ | |
|
|
||||||
| | | Power Management | | |
|
|
||||||
| | +------------------+ Linux/ACPI Components | |
|
|
||||||
| | +--------------------+ | |
|
|
||||||
| | | Thermal Management | | |
|
|
||||||
| | +--------------------+ | |
|
|
||||||
| | +--------------------------+ | |
|
|
||||||
| | | Drivers for ACPI Devices | | |
|
|
||||||
| | +--------------------------+ | |
|
|
||||||
| | +--------+ | |
|
|
||||||
| | | ...... | | |
|
|
||||||
| | +--------+ | |
|
|
||||||
| +---------------------------------------------------+ |
|
|
||||||
| |
|
|
||||||
+---------------------------------------------------------+
|
|
||||||
|
|
||||||
Figure 1. Linux ACPI Software Components
|
|
||||||
|
|
||||||
NOTE:
|
|
||||||
A. OS Service Layer - Provided by Linux to offer OS dependent
|
|
||||||
implementation of the predefined ACPICA interfaces (acpi_os_*).
|
|
||||||
include/acpi/acpiosxf.h
|
|
||||||
drivers/acpi/osl.c
|
|
||||||
include/acpi/platform
|
|
||||||
include/asm/acenv.h
|
|
||||||
B. ACPICA Functionality - Released from ACPICA code base to offer
|
|
||||||
OS independent implementation of the ACPICA interfaces (acpi_*).
|
|
||||||
drivers/acpi/acpica
|
|
||||||
include/acpi/ac*.h
|
|
||||||
tools/power/acpi
|
|
||||||
C. Linux/ACPI Functionality - Providing Linux specific ACPI
|
|
||||||
functionality to the other Linux kernel subsystems and user space
|
|
||||||
programs.
|
|
||||||
drivers/acpi
|
|
||||||
include/linux/acpi.h
|
|
||||||
include/linux/acpi*.h
|
|
||||||
include/acpi
|
|
||||||
tools/power/acpi
|
|
||||||
D. Architecture Specific ACPICA/ACPI Functionalities - Provided by the
|
|
||||||
ACPI subsystem to offer architecture specific implementation of the
|
|
||||||
ACPI interfaces. They are Linux specific components and are out of
|
|
||||||
the scope of this document.
|
|
||||||
include/asm/acpi.h
|
|
||||||
include/asm/acpi*.h
|
|
||||||
arch/*/acpi
|
|
||||||
|
|
||||||
2. ACPICA Release
|
|
||||||
|
|
||||||
The ACPICA project maintains its code base at the following repository URL:
|
|
||||||
https://github.com/acpica/acpica.git. As a rule, a release is made every
|
|
||||||
month.
|
|
||||||
|
|
||||||
As the coding style adopted by the ACPICA project is not acceptable by
|
|
||||||
Linux, there is a release process to convert the ACPICA git commits into
|
|
||||||
Linux patches. The patches generated by this process are referred to as
|
|
||||||
"linuxized ACPICA patches". The release process is carried out on a local
|
|
||||||
copy of the ACPICA git repository. Each commit in the monthly release is
|
|
||||||
converted into a linuxized ACPICA patch. Together, they form the monthly
|
|
||||||
ACPICA release patchset for the Linux ACPI community. This process is
|
|
||||||
illustrated in the following figure:
|
|
||||||
|
|
||||||
+-----------------------------+
|
|
||||||
| acpica / master (-) commits |
|
|
||||||
+-----------------------------+
|
|
||||||
/|\ |
|
|
||||||
| \|/
|
|
||||||
| /---------------------\ +----------------------+
|
|
||||||
| < Linuxize repo Utility >-->| old linuxized acpica |--+
|
|
||||||
| \---------------------/ +----------------------+ |
|
|
||||||
| |
|
|
||||||
/---------\ |
|
|
||||||
< git reset > \
|
|
||||||
\---------/ \
|
|
||||||
/|\ /+-+
|
|
||||||
| / |
|
|
||||||
+-----------------------------+ | |
|
|
||||||
| acpica / master (+) commits | | |
|
|
||||||
+-----------------------------+ | |
|
|
||||||
| | |
|
|
||||||
\|/ | |
|
|
||||||
/-----------------------\ +----------------------+ | |
|
|
||||||
< Linuxize repo Utilities >-->| new linuxized acpica |--+ |
|
|
||||||
\-----------------------/ +----------------------+ |
|
|
||||||
\|/
|
|
||||||
+--------------------------+ /----------------------\
|
|
||||||
| Linuxized ACPICA Patches |<----------------< Linuxize patch Utility >
|
|
||||||
+--------------------------+ \----------------------/
|
|
||||||
|
|
|
||||||
\|/
|
|
||||||
/---------------------------\
|
|
||||||
< Linux ACPI Community Review >
|
|
||||||
\---------------------------/
|
|
||||||
|
|
|
||||||
\|/
|
|
||||||
+-----------------------+ /------------------\ +----------------+
|
|
||||||
| linux-pm / linux-next |-->< Linux Merge Window >-->| linux / master |
|
|
||||||
+-----------------------+ \------------------/ +----------------+
|
|
||||||
|
|
||||||
Figure 2. ACPICA -> Linux Upstream Process
|
|
||||||
|
|
||||||
NOTE:
|
|
||||||
A. Linuxize Utilities - Provided by the ACPICA repository, including a
|
|
||||||
utility located in source/tools/acpisrc folder and a number of
|
|
||||||
scripts located in generate/linux folder.
|
|
||||||
B. acpica / master - "master" branch of the git repository at
|
|
||||||
<https://github.com/acpica/acpica.git>.
|
|
||||||
C. linux-pm / linux-next - "linux-next" branch of the git repository at
|
|
||||||
<http://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git>.
|
|
||||||
D. linux / master - "master" branch of the git repository at
|
|
||||||
<http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git>.
|
|
||||||
|
|
||||||
Before the linuxized ACPICA patches are sent to the Linux ACPI community
|
|
||||||
for review, there is a quality assurance build test process to reduce
|
|
||||||
porting issues. Currently this build process only takes care of the
|
|
||||||
following kernel configuration options:
|
|
||||||
CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER
|
|
||||||
|
|
||||||
3. ACPICA Divergences
|
|
||||||
|
|
||||||
Ideally, all of the ACPICA commits should be converted into Linux patches
|
|
||||||
automatically without manual modifications, the "linux / master" tree should
|
|
||||||
contain the ACPICA code that exactly corresponds to the ACPICA code
|
|
||||||
contained in "new linuxized acpica" tree and it should be possible to run
|
|
||||||
the release process fully automatically.
|
|
||||||
|
|
||||||
As a matter of fact, however, there are source code differences between
|
|
||||||
the ACPICA code in Linux and the upstream ACPICA code, referred to as
|
|
||||||
"ACPICA Divergences".
|
|
||||||
|
|
||||||
The various sources of ACPICA divergences include:
|
|
||||||
1. Legacy divergences - Before the current ACPICA release process was
|
|
||||||
established, there already had been divergences between Linux and
|
|
||||||
ACPICA. Over the past several years those divergences have been greatly
|
|
||||||
reduced, but there still are several ones and it takes time to figure
|
|
||||||
out the underlying reasons for their existence.
|
|
||||||
2. Manual modifications - Any manual modification (e.g. coding style fixes)
|
|
||||||
made directly in the Linux sources obviously hurts the ACPICA release
|
|
||||||
automation. Thus it is recommended to fix such issues in the ACPICA
|
|
||||||
upstream source code and generate the linuxized fix using the ACPICA
|
|
||||||
release utilities (please refer to Section 4 below for the details).
|
|
||||||
3. Linux specific features - Sometimes it's impossible to use the
|
|
||||||
current ACPICA APIs to implement features required by the Linux kernel,
|
|
||||||
so Linux developers occasionally have to change ACPICA code directly.
|
|
||||||
Those changes may not be acceptable by ACPICA upstream and in such cases
|
|
||||||
they are left as committed ACPICA divergences unless the ACPICA side can
|
|
||||||
implement new mechanisms as replacements for them.
|
|
||||||
4. ACPICA release fixups - ACPICA only tests commits using a set of the
|
|
||||||
user space simulation utilities, thus the linuxized ACPICA patches may
|
|
||||||
break the Linux kernel, leaving us build/boot failures. In order to
|
|
||||||
avoid breaking Linux bisection, fixes are applied directly to the
|
|
||||||
linuxized ACPICA patches during the release process. When the release
|
|
||||||
fixups are backported to the upstream ACPICA sources, they must follow
|
|
||||||
the upstream ACPICA rules and so further modifications may appear.
|
|
||||||
That may result in the appearance of new divergences.
|
|
||||||
5. Fast tracking of ACPICA commits - Some ACPICA commits are regression
|
|
||||||
fixes or stable-candidate material, so they are applied in advance with
|
|
||||||
respect to the ACPICA release process. If such commits are reverted or
|
|
||||||
rebased on the ACPICA side in order to offer better solutions, new ACPICA
|
|
||||||
divergences are generated.
|
|
||||||
|
|
||||||
4. ACPICA Development
|
|
||||||
|
|
||||||
This paragraph guides Linux developers to use the ACPICA upstream release
|
|
||||||
utilities to obtain Linux patches corresponding to upstream ACPICA commits
|
|
||||||
before they become available from the ACPICA release process.
|
|
||||||
|
|
||||||
1. Cherry-pick an ACPICA commit
|
|
||||||
|
|
||||||
First you need to git clone the ACPICA repository and the ACPICA change
|
|
||||||
you want to cherry pick must be committed into the local repository.
|
|
||||||
|
|
||||||
Then the gen-patch.sh command can help to cherry-pick an ACPICA commit
|
|
||||||
from the ACPICA local repository:
|
|
||||||
|
|
||||||
$ git clone https://github.com/acpica/acpica
|
|
||||||
$ cd acpica
|
|
||||||
$ generate/linux/gen-patch.sh -u [commit ID]
|
|
||||||
|
|
||||||
Here the commit ID is the ACPICA local repository commit ID you want to
|
|
||||||
cherry pick. It can be omitted if the commit is "HEAD".
|
|
||||||
|
|
||||||
2. Cherry-pick recent ACPICA commits
|
|
||||||
|
|
||||||
Sometimes you need to rebase your code on top of the most recent ACPICA
|
|
||||||
changes that haven't been applied to Linux yet.
|
|
||||||
|
|
||||||
You can generate the ACPICA release series yourself and rebase your code on
|
|
||||||
top of the generated ACPICA release patches:
|
|
||||||
|
|
||||||
$ git clone https://github.com/acpica/acpica
|
|
||||||
$ cd acpica
|
|
||||||
$ generate/linux/make-patches.sh -u [commit ID]
|
|
||||||
|
|
||||||
The commit ID should be the last ACPICA commit accepted by Linux. Usually,
|
|
||||||
it is the commit modifying ACPI_CA_VERSION. It can be found by executing
|
|
||||||
"git blame source/include/acpixf.h" and referencing the line that contains
|
|
||||||
"ACPI_CA_VERSION".
|
|
||||||
|
|
||||||
3. Inspect the current divergences
|
|
||||||
|
|
||||||
If you have local copies of both Linux and upstream ACPICA, you can generate
|
|
||||||
a diff file indicating the state of the current divergences:
|
|
||||||
|
|
||||||
# git clone https://github.com/acpica/acpica
|
|
||||||
# git clone http://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
|
|
||||||
# cd acpica
|
|
||||||
# generate/linux/divergences.sh -s ../linux
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
To enumerate platform Low Power Idle states, Intel platforms are using
|
|
||||||
“Low Power Idle Table” (LPIT). More details about this table can be
|
|
||||||
downloaded from:
|
|
||||||
http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf
|
|
||||||
|
|
||||||
Residencies for each low power state can be read via FFH
|
|
||||||
(Function fixed hardware) or a memory mapped interface.
|
|
||||||
|
|
||||||
On platforms supporting S0ix sleep states, there can be two types of
|
|
||||||
residencies:
|
|
||||||
- CPU PKG C10 (Read via FFH interface)
|
|
||||||
- Platform Controller Hub (PCH) SLP_S0 (Read via memory mapped interface)
|
|
||||||
|
|
||||||
The following attributes are added dynamically to the cpuidle
|
|
||||||
sysfs attribute group:
|
|
||||||
/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
|
|
||||||
/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us
|
|
||||||
|
|
||||||
The "low_power_idle_cpu_residency_us" attribute shows time spent
|
|
||||||
by the CPU package in PKG C10
|
|
||||||
|
|
||||||
The "low_power_idle_system_residency_us" attribute shows SLP_S0
|
|
||||||
residency, or system time spent with the SLP_S0# signal asserted.
|
|
||||||
This is the lowest possible system power state, achieved only when CPU is in
|
|
||||||
PKG C10 and all functional blocks in PCH are in a low power state.
|
|
||||||
@@ -1,73 +0,0 @@
|
|||||||
Linux ACPI Custom Control Method How To
|
|
||||||
=======================================
|
|
||||||
|
|
||||||
Written by Zhang Rui <rui.zhang@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Linux supports customizing ACPI control methods at runtime.
|
|
||||||
|
|
||||||
Users can use this to
|
|
||||||
1. override an existing method which may not work correctly,
|
|
||||||
or just for debugging purposes.
|
|
||||||
2. insert a completely new method in order to create a missing
|
|
||||||
method such as _OFF, _ON, _STA, _INI, etc.
|
|
||||||
For these cases, it is far simpler to dynamically install a single
|
|
||||||
control method rather than override the entire DSDT, because kernel
|
|
||||||
rebuild/reboot is not needed and test result can be got in minutes.
|
|
||||||
|
|
||||||
Note: Only ACPI METHOD can be overridden, any other object types like
|
|
||||||
"Device", "OperationRegion", are not recognized. Methods
|
|
||||||
declared inside scope operators are also not supported.
|
|
||||||
Note: The same ACPI control method can be overridden many times,
|
|
||||||
and it's always the latest one that is used by Linux/kernel.
|
|
||||||
Note: To get the ACPI debug object output (Store (AAAA, Debug)),
|
|
||||||
please run "echo 1 > /sys/module/acpi/parameters/aml_debug_output".
|
|
||||||
|
|
||||||
1. override an existing method
|
|
||||||
a) get the ACPI table via ACPI sysfs I/F. e.g. to get the DSDT,
|
|
||||||
just run "cat /sys/firmware/acpi/tables/DSDT > /tmp/dsdt.dat"
|
|
||||||
b) disassemble the table by running "iasl -d dsdt.dat".
|
|
||||||
c) rewrite the ASL code of the method and save it in a new file,
|
|
||||||
d) package the new file (psr.asl) to an ACPI table format.
|
|
||||||
Here is an example of a customized \_SB_.AC._PSR method,
|
|
||||||
|
|
||||||
DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
|
|
||||||
{
|
|
||||||
Method (\_SB_.AC._PSR, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
Store ("In AC _PSR", Debug)
|
|
||||||
Return (ACON)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Note that the full pathname of the method in ACPI namespace
|
|
||||||
should be used.
|
|
||||||
e) assemble the file to generate the AML code of the method.
|
|
||||||
e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result)
|
|
||||||
If parameter "-vw 6084" is not supported by your iASL compiler,
|
|
||||||
please try a newer version.
|
|
||||||
f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
|
|
||||||
g) override the old method via the debugfs by running
|
|
||||||
"cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
|
|
||||||
|
|
||||||
2. insert a new method
|
|
||||||
This is easier than overriding an existing method.
|
|
||||||
We just need to create the ASL code of the method we want to
|
|
||||||
insert and then follow steps c) ~ g) in section 1.
|
|
||||||
|
|
||||||
3. undo your changes
|
|
||||||
The "undo" operation is not supported for a newly inserted method
|
|
||||||
right now, i.e. we can not remove a method currently.
|
|
||||||
For an overridden method, in order to undo your changes, please
|
|
||||||
save a copy of the method original ASL code in step c) section 1,
|
|
||||||
and redo step c) ~ g) to override the method with the original one.
|
|
||||||
|
|
||||||
|
|
||||||
Note: We can use a kernel with multiple custom ACPI methods running,
|
|
||||||
but each individual write to debugfs can implement a SINGLE
|
|
||||||
method override. i.e. if we want to insert/override multiple
|
|
||||||
ACPI methods, we need to redo steps c) ~ g) multiple times.
|
|
||||||
|
|
||||||
Note: Be aware that root can mis-use this driver to modify arbitrary
|
|
||||||
memory and gain additional rights, if root's privileges got
|
|
||||||
restricted (for example if root is not allowed to load additional
|
|
||||||
modules after boot).
|
|
||||||
@@ -1,192 +0,0 @@
|
|||||||
ACPICA Trace Facility
|
|
||||||
|
|
||||||
Copyright (C) 2015, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
|
|
||||||
This document describes the functions and the interfaces of the method
|
|
||||||
tracing facility.
|
|
||||||
|
|
||||||
1. Functionalities and usage examples:
|
|
||||||
|
|
||||||
ACPICA provides method tracing capability. And two functions are
|
|
||||||
currently implemented using this capability.
|
|
||||||
|
|
||||||
A. Log reducer
|
|
||||||
ACPICA subsystem provides debugging outputs when CONFIG_ACPI_DEBUG is
|
|
||||||
enabled. The debugging messages which are deployed via
|
|
||||||
ACPI_DEBUG_PRINT() macro can be reduced at 2 levels - per-component
|
|
||||||
level (known as debug layer, configured via
|
|
||||||
/sys/module/acpi/parameters/debug_layer) and per-type level (known as
|
|
||||||
debug level, configured via /sys/module/acpi/parameters/debug_level).
|
|
||||||
|
|
||||||
But when the particular layer/level is applied to the control method
|
|
||||||
evaluations, the quantity of the debugging outputs may still be too
|
|
||||||
large to be put into the kernel log buffer. The idea thus is worked out
|
|
||||||
to only enable the particular debug layer/level (normally more detailed)
|
|
||||||
logs when the control method evaluation is started, and disable the
|
|
||||||
detailed logging when the control method evaluation is stopped.
|
|
||||||
|
|
||||||
The following command examples illustrate the usage of the "log reducer"
|
|
||||||
functionality:
|
|
||||||
a. Filter out the debug layer/level matched logs when control methods
|
|
||||||
are being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "enable" > trace_state
|
|
||||||
b. Filter out the debug layer/level matched logs when the specified
|
|
||||||
control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method" > /sys/module/acpi/parameters/trace_state
|
|
||||||
c. Filter out the debug layer/level matched logs when the specified
|
|
||||||
control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0xXXXXXXXX" > trace_debug_layer
|
|
||||||
# echo "0xYYYYYYYY" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method-once" > /sys/module/acpi/parameters/trace_state
|
|
||||||
Where:
|
|
||||||
0xXXXXXXXX/0xYYYYYYYY: Refer to Documentation/acpi/debug.txt for
|
|
||||||
possible debug layer/level masking values.
|
|
||||||
\PPPP.AAAA.TTTT.HHHH: Full path of a control method that can be found
|
|
||||||
in the ACPI namespace. It needn't be an entry
|
|
||||||
of a control method evaluation.
|
|
||||||
|
|
||||||
B. AML tracer
|
|
||||||
|
|
||||||
There are special log entries added by the method tracing facility at
|
|
||||||
the "trace points" the AML interpreter starts/stops to execute a control
|
|
||||||
method, or an AML opcode. Note that the format of the log entries is
|
|
||||||
subject to change:
|
|
||||||
[ 0.186427] exdebug-0398 ex_trace_point : Method Begin [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
|
||||||
[ 0.186630] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905c88:If] execution.
|
|
||||||
[ 0.186820] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:LEqual] execution.
|
|
||||||
[ 0.187010] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905a20:-NamePath-] execution.
|
|
||||||
[ 0.187214] exdebug-0398 ex_trace_point : Opcode End [0xf5905a20:-NamePath-] execution.
|
|
||||||
[ 0.187407] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
|
||||||
[ 0.187594] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
|
||||||
[ 0.187789] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:LEqual] execution.
|
|
||||||
[ 0.187980] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905cc0:Return] execution.
|
|
||||||
[ 0.188146] exdebug-0398 ex_trace_point : Opcode Begin [0xf5905f60:One] execution.
|
|
||||||
[ 0.188334] exdebug-0398 ex_trace_point : Opcode End [0xf5905f60:One] execution.
|
|
||||||
[ 0.188524] exdebug-0398 ex_trace_point : Opcode End [0xf5905cc0:Return] execution.
|
|
||||||
[ 0.188712] exdebug-0398 ex_trace_point : Opcode End [0xf5905c88:If] execution.
|
|
||||||
[ 0.188903] exdebug-0398 ex_trace_point : Method End [0xf58394d8:\_SB.PCI0.LPCB.ECOK] execution.
|
|
||||||
|
|
||||||
Developers can utilize these special log entries to track the AML
|
|
||||||
interpretation, thus can aid issue debugging and performance tuning. Note
|
|
||||||
that, as the "AML tracer" logs are implemented via ACPI_DEBUG_PRINT()
|
|
||||||
macro, CONFIG_ACPI_DEBUG is also required to be enabled for enabling
|
|
||||||
"AML tracer" logs.
|
|
||||||
|
|
||||||
The following command examples illustrate the usage of the "AML tracer"
|
|
||||||
functionality:
|
|
||||||
a. Filter out the method start/stop "AML tracer" logs when control
|
|
||||||
methods are being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "enable" > trace_state
|
|
||||||
b. Filter out the method start/stop "AML tracer" when the specified
|
|
||||||
control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method" > trace_state
|
|
||||||
c. Filter out the method start/stop "AML tracer" logs when the specified
|
|
||||||
control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "method-once" > trace_state
|
|
||||||
d. Filter out the method/opcode start/stop "AML tracer" when the
|
|
||||||
specified control method is being evaluated:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "opcode" > trace_state
|
|
||||||
e. Filter out the method/opcode start/stop "AML tracer" when the
|
|
||||||
specified control method is being evaluated for the first time:
|
|
||||||
# cd /sys/module/acpi/parameters
|
|
||||||
# echo "0x80" > trace_debug_layer
|
|
||||||
# echo "0x10" > trace_debug_level
|
|
||||||
# echo "\PPPP.AAAA.TTTT.HHHH" > trace_method_name
|
|
||||||
# echo "opcode-once" > trace_state
|
|
||||||
|
|
||||||
Note that all above method tracing facility related module parameters can
|
|
||||||
be used as the boot parameters, for example:
|
|
||||||
acpi.trace_debug_layer=0x80 acpi.trace_debug_level=0x10 \
|
|
||||||
acpi.trace_method_name=\_SB.LID0._LID acpi.trace_state=opcode-once
|
|
||||||
|
|
||||||
2. Interface descriptions:
|
|
||||||
|
|
||||||
All method tracing functions can be configured via ACPI module
|
|
||||||
parameters that are accessible at /sys/module/acpi/parameters/:
|
|
||||||
|
|
||||||
trace_method_name
|
|
||||||
The full path of the AML method that the user wants to trace.
|
|
||||||
Note that the full path shouldn't contain the trailing "_"s in its
|
|
||||||
name segments but may contain "\" to form an absolute path.
|
|
||||||
|
|
||||||
trace_debug_layer
|
|
||||||
The temporary debug_layer used when the tracing feature is enabled.
|
|
||||||
Using ACPI_EXECUTER (0x80) by default, which is the debug_layer
|
|
||||||
used to match all "AML tracer" logs.
|
|
||||||
|
|
||||||
trace_debug_level
|
|
||||||
The temporary debug_level used when the tracing feature is enabled.
|
|
||||||
Using ACPI_LV_TRACE_POINT (0x10) by default, which is the
|
|
||||||
debug_level used to match all "AML tracer" logs.
|
|
||||||
|
|
||||||
trace_state
|
|
||||||
The status of the tracing feature.
|
|
||||||
Users can enable/disable this debug tracing feature by executing
|
|
||||||
the following command:
|
|
||||||
# echo string > /sys/module/acpi/parameters/trace_state
|
|
||||||
Where "string" should be one of the following:
|
|
||||||
"disable"
|
|
||||||
Disable the method tracing feature.
|
|
||||||
"enable"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during any method
|
|
||||||
execution will be logged.
|
|
||||||
"method"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method execution
|
|
||||||
of "trace_method_name" will be logged.
|
|
||||||
"method-once"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method execution
|
|
||||||
of "trace_method_name" will be logged only once.
|
|
||||||
"opcode"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
|
||||||
execution of "trace_method_name" will be logged.
|
|
||||||
"opcode-once"
|
|
||||||
Enable the method tracing feature.
|
|
||||||
ACPICA debugging messages matching
|
|
||||||
"trace_debug_layer/trace_debug_level" during method/opcode
|
|
||||||
execution of "trace_method_name" will be logged only once.
|
|
||||||
Note that the differences between "enable" and the other feature
|
|
||||||
enabling options are:
|
|
||||||
1. When "enable" is specified, since
|
|
||||||
"trace_debug_layer/trace_debug_level" shall apply to all control
|
|
||||||
method evaluations, after configuring "trace_state" to "enable",
|
|
||||||
"trace_method_name" will be reset to NULL.
|
|
||||||
2. When "method/opcode" is specified, if
|
|
||||||
"trace_method_name" is NULL when "trace_state" is configured to
|
|
||||||
these options, the "trace_debug_layer/trace_debug_level" will
|
|
||||||
apply to all control method evaluations.
|
|
||||||
@@ -1,388 +0,0 @@
|
|||||||
ACPI Device Tree - Representation of ACPI Namespace
|
|
||||||
|
|
||||||
Copyright (C) 2013, Intel Corporation
|
|
||||||
Author: Lv Zheng <lv.zheng@intel.com>
|
|
||||||
|
|
||||||
|
|
||||||
Abstract:
|
|
||||||
|
|
||||||
The Linux ACPI subsystem converts ACPI namespace objects into a Linux
|
|
||||||
device tree under /sys/devices/LNXSYSTM:00 and updates it upon
|
|
||||||
receiving ACPI hotplug notification events. For each device object in this
|
|
||||||
hierarchy there is a corresponding symbolic link in
|
|
||||||
/sys/bus/acpi/devices.
|
|
||||||
This document illustrates the structure of the ACPI device tree.
|
|
||||||
|
|
||||||
|
|
||||||
Credit:
|
|
||||||
|
|
||||||
Thanks for the help from Zhang Rui <rui.zhang@intel.com> and Rafael J.
|
|
||||||
Wysocki <rafael.j.wysocki@intel.com>.
|
|
||||||
|
|
||||||
|
|
||||||
1. ACPI Definition Blocks
|
|
||||||
|
|
||||||
The ACPI firmware sets up RSDP (Root System Description Pointer) in the
|
|
||||||
system memory address space pointing to the XSDT (Extended System
|
|
||||||
Description Table). The XSDT always points to the FADT (Fixed ACPI
|
|
||||||
Description Table) using its first entry, the data within the FADT
|
|
||||||
includes various fixed-length entries that describe fixed ACPI features
|
|
||||||
of the hardware. The FADT contains a pointer to the DSDT
|
|
||||||
(Differentiated System Description Table). The XSDT also contains
|
|
||||||
entries pointing to possibly multiple SSDTs (Secondary System
|
|
||||||
Description Table).
|
|
||||||
|
|
||||||
The DSDT and SSDT data is organized in data structures called definition
|
|
||||||
blocks that contain definitions of various objects, including ACPI
|
|
||||||
control methods, encoded in AML (ACPI Machine Language). The data block
|
|
||||||
of the DSDT along with the contents of SSDTs represents a hierarchical
|
|
||||||
data structure called the ACPI namespace whose topology reflects the
|
|
||||||
structure of the underlying hardware platform.
|
|
||||||
|
|
||||||
The relationships between ACPI System Definition Tables described above
|
|
||||||
are illustrated in the following diagram.
|
|
||||||
|
|
||||||
+---------+ +-------+ +--------+ +------------------------+
|
|
||||||
| RSDP | +->| XSDT | +->| FADT | | +-------------------+ |
|
|
||||||
+---------+ | +-------+ | +--------+ +-|->| DSDT | |
|
|
||||||
| Pointer | | | Entry |-+ | ...... | | | +-------------------+ |
|
|
||||||
+---------+ | +-------+ | X_DSDT |--+ | | Definition Blocks | |
|
|
||||||
| Pointer |-+ | ..... | | ...... | | +-------------------+ |
|
|
||||||
+---------+ +-------+ +--------+ | +-------------------+ |
|
|
||||||
| Entry |------------------|->| SSDT | |
|
|
||||||
+- - - -+ | +-------------------| |
|
|
||||||
| Entry | - - - - - - - -+ | | Definition Blocks | |
|
|
||||||
+- - - -+ | | +-------------------+ |
|
|
||||||
| | +- - - - - - - - - -+ |
|
|
||||||
+-|->| SSDT | |
|
|
||||||
| +-------------------+ |
|
|
||||||
| | Definition Blocks | |
|
|
||||||
| +- - - - - - - - - -+ |
|
|
||||||
+------------------------+
|
|
||||||
|
|
|
||||||
OSPM Loading |
|
|
||||||
\|/
|
|
||||||
+----------------+
|
|
||||||
| ACPI Namespace |
|
|
||||||
+----------------+
|
|
||||||
|
|
||||||
Figure 1. ACPI Definition Blocks
|
|
||||||
|
|
||||||
NOTE: RSDP can also contain a pointer to the RSDT (Root System
|
|
||||||
Description Table). Platforms provide RSDT to enable
|
|
||||||
compatibility with ACPI 1.0 operating systems. The OS is expected
|
|
||||||
to use XSDT, if present.
|
|
||||||
|
|
||||||
|
|
||||||
2. Example ACPI Namespace
|
|
||||||
|
|
||||||
All definition blocks are loaded into a single namespace. The namespace
|
|
||||||
is a hierarchy of objects identified by names and paths.
|
|
||||||
The following naming conventions apply to object names in the ACPI
|
|
||||||
namespace:
|
|
||||||
1. All names are 32 bits long.
|
|
||||||
2. The first byte of a name must be one of 'A' - 'Z', '_'.
|
|
||||||
3. Each of the remaining bytes of a name must be one of 'A' - 'Z', '0'
|
|
||||||
- '9', '_'.
|
|
||||||
4. Names starting with '_' are reserved by the ACPI specification.
|
|
||||||
5. The '\' symbol represents the root of the namespace (i.e. names
|
|
||||||
prepended with '\' are relative to the namespace root).
|
|
||||||
6. The '^' symbol represents the parent of the current namespace node
|
|
||||||
(i.e. names prepended with '^' are relative to the parent of the
|
|
||||||
current namespace node).
|
|
||||||
|
|
||||||
The figure below shows an example ACPI namespace.
|
|
||||||
|
|
||||||
+------+
|
|
||||||
| \ | Root
|
|
||||||
+------+
|
|
||||||
|
|
|
||||||
| +------+
|
|
||||||
+-| _PR | Scope(_PR): the processor namespace
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| CPU0 | Processor(CPU0): the first processor
|
|
||||||
| +------+
|
|
||||||
|
|
|
||||||
| +------+
|
|
||||||
+-| _SB | Scope(_SB): the system bus namespace
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| LID0 | Device(LID0): the lid device
|
|
||||||
| | +------+
|
|
||||||
| | |
|
|
||||||
| | | +------+
|
|
||||||
| | +-| _HID | Name(_HID, "PNP0C0D"): the hardware ID
|
|
||||||
| | | +------+
|
|
||||||
| | |
|
|
||||||
| | | +------+
|
|
||||||
| | +-| _STA | Method(_STA): the status control method
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| PCI0 | Device(PCI0): the PCI root bridge
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| _HID | Name(_HID, "PNP0A08"): the hardware ID
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| _CID | Name(_CID, "PNP0A03"): the compatible ID
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| RP03 | Scope(RP03): the PCI0 power scope
|
|
||||||
| | +------+
|
|
||||||
| | |
|
|
||||||
| | | +------+
|
|
||||||
| | +-| PXP3 | PowerResource(PXP3): the PCI0 power resource
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| GFX0 | Device(GFX0): the graphics adapter
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| _ADR | Name(_ADR, 0x00020000): the PCI bus address
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| DD01 | Device(DD01): the LCD output device
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| _BCL | Method(_BCL): the backlight control method
|
|
||||||
| +------+
|
|
||||||
|
|
|
||||||
| +------+
|
|
||||||
+-| _TZ | Scope(_TZ): the thermal zone namespace
|
|
||||||
| +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| FN00 | PowerResource(FN00): the FAN0 power resource
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| FAN0 | Device(FAN0): the FAN0 cooling device
|
|
||||||
| | +------+
|
|
||||||
| | |
|
|
||||||
| | | +------+
|
|
||||||
| | +-| _HID | Name(_HID, "PNP0C0B"): the hardware ID
|
|
||||||
| | +------+
|
|
||||||
| |
|
|
||||||
| | +------+
|
|
||||||
| +-| TZ00 | ThermalZone(TZ00): the FAN thermal zone
|
|
||||||
| +------+
|
|
||||||
|
|
|
||||||
| +------+
|
|
||||||
+-| _GPE | Scope(_GPE): the GPE namespace
|
|
||||||
+------+
|
|
||||||
|
|
||||||
Figure 2. Example ACPI Namespace
|
|
||||||
|
|
||||||
|
|
||||||
3. Linux ACPI Device Objects
|
|
||||||
|
|
||||||
The Linux kernel's core ACPI subsystem creates struct acpi_device
|
|
||||||
objects for ACPI namespace objects representing devices, power resources,
|
|
||||||
processors, thermal zones. Those objects are exported to user space via
|
|
||||||
sysfs as directories in the subtree under /sys/devices/LNXSYSTM:00. The
|
|
||||||
format of their names is <bus_id:instance>, where 'bus_id' refers to the
|
|
||||||
ACPI namespace representation of the given object and 'instance' is used
|
|
||||||
for distinguishing different objects of the same 'bus_id' (it is
|
|
||||||
two-digit decimal representation of an unsigned integer).
|
|
||||||
|
|
||||||
The value of 'bus_id' depends on the type of the object whose name it is
|
|
||||||
part of as listed in the table below.
|
|
||||||
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| | Object/Feature | Table | bus_id |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | Root | xSDT | LNXSYSTM |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | Device | xSDT | _HID |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | Processor | xSDT | LNXCPU |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | ThermalZone | xSDT | LNXTHERM |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | PowerResource | xSDT | LNXPOWER |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| N | Other Devices | xSDT | device |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| F | PWR_BUTTON | FADT | LNXPWRBN |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| F | SLP_BUTTON | FADT | LNXSLPBN |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| M | Video Extension | xSDT | LNXVIDEO |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| M | ATA Controller | xSDT | LNXIOBAY |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
| M | Docking Station | xSDT | LNXDOCK |
|
|
||||||
+---+-----------------+-------+----------+
|
|
||||||
|
|
||||||
Table 1. ACPI Namespace Objects Mapping
|
|
||||||
|
|
||||||
The following rules apply when creating struct acpi_device objects on
|
|
||||||
the basis of the contents of ACPI System Description Tables (as
|
|
||||||
indicated by the letter in the first column and the notation in the
|
|
||||||
second column of the table above):
|
|
||||||
N:
|
|
||||||
The object's source is an ACPI namespace node (as indicated by the
|
|
||||||
named object's type in the second column). In that case the object's
|
|
||||||
directory in sysfs will contain the 'path' attribute whose value is
|
|
||||||
the full path to the node from the namespace root.
|
|
||||||
F:
|
|
||||||
The struct acpi_device object is created for a fixed hardware
|
|
||||||
feature (as indicated by the fixed feature flag's name in the second
|
|
||||||
column), so its sysfs directory will not contain the 'path'
|
|
||||||
attribute.
|
|
||||||
M:
|
|
||||||
The struct acpi_device object is created for an ACPI namespace node
|
|
||||||
with specific control methods (as indicated by the ACPI defined
|
|
||||||
device's type in the second column). The 'path' attribute containing
|
|
||||||
its namespace path will be present in its sysfs directory. For
|
|
||||||
example, if the _BCL method is present for an ACPI namespace node, a
|
|
||||||
struct acpi_device object with LNXVIDEO 'bus_id' will be created for
|
|
||||||
it.
|
|
||||||
|
|
||||||
The third column of the above table indicates which ACPI System
|
|
||||||
Description Tables contain information used for the creation of the
|
|
||||||
struct acpi_device objects represented by the given row (xSDT means DSDT
|
|
||||||
or SSDT).
|
|
||||||
|
|
||||||
The fourth column of the above table indicates the 'bus_id' generation
|
|
||||||
rule of the struct acpi_device object:
|
|
||||||
_HID:
|
|
||||||
_HID in the last column of the table means that the object's bus_id
|
|
||||||
is derived from the _HID/_CID identification objects present under
|
|
||||||
the corresponding ACPI namespace node. The object's sysfs directory
|
|
||||||
will then contain the 'hid' and 'modalias' attributes that can be
|
|
||||||
used to retrieve the _HID and _CIDs of that object.
|
|
||||||
LNXxxxxx:
|
|
||||||
The 'modalias' attribute is also present for struct acpi_device
|
|
||||||
objects having bus_id of the "LNXxxxxx" form (pseudo devices), in
|
|
||||||
which cases it contains the bus_id string itself.
|
|
||||||
device:
|
|
||||||
'device' in the last column of the table indicates that the object's
|
|
||||||
bus_id cannot be determined from _HID/_CID of the corresponding
|
|
||||||
ACPI namespace node, although that object represents a device (for
|
|
||||||
example, it may be a PCI device with _ADR defined and without _HID
|
|
||||||
or _CID). In that case the string 'device' will be used as the
|
|
||||||
object's bus_id.
|
|
||||||
|
|
||||||
|
|
||||||
4. Linux ACPI Physical Device Glue
|
|
||||||
|
|
||||||
ACPI device (i.e. struct acpi_device) objects may be linked to other
|
|
||||||
objects in the Linux device hierarchy that represent "physical" devices
|
|
||||||
(for example, devices on the PCI bus). If that happens, it means that
|
|
||||||
the ACPI device object is a "companion" of a device otherwise
|
|
||||||
represented in a different way and is used (1) to provide configuration
|
|
||||||
information on that device which cannot be obtained by other means and
|
|
||||||
(2) to do specific things to the device with the help of its ACPI
|
|
||||||
control methods. One ACPI device object may be linked this way to
|
|
||||||
multiple "physical" devices.
|
|
||||||
|
|
||||||
If an ACPI device object is linked to a "physical" device, its sysfs
|
|
||||||
directory contains the "physical_node" symbolic link to the sysfs
|
|
||||||
directory of the target device object. In turn, the target device's
|
|
||||||
sysfs directory will then contain the "firmware_node" symbolic link to
|
|
||||||
the sysfs directory of the companion ACPI device object.
|
|
||||||
The linking mechanism relies on device identification provided by the
|
|
||||||
ACPI namespace. For example, if there's an ACPI namespace object
|
|
||||||
representing a PCI device (i.e. a device object under an ACPI namespace
|
|
||||||
object representing a PCI bridge) whose _ADR returns 0x00020000 and the
|
|
||||||
bus number of the parent PCI bridge is 0, the sysfs directory
|
|
||||||
representing the struct acpi_device object created for that ACPI
|
|
||||||
namespace object will contain the 'physical_node' symbolic link to the
|
|
||||||
/sys/devices/pci0000:00/0000:00:02.0/ sysfs directory of the
|
|
||||||
corresponding PCI device.
|
|
||||||
|
|
||||||
The linking mechanism is generally bus-specific. The core of its
|
|
||||||
implementation is located in the drivers/acpi/glue.c file, but there are
|
|
||||||
complementary parts depending on the bus types in question located
|
|
||||||
elsewhere. For example, the PCI-specific part of it is located in
|
|
||||||
drivers/pci/pci-acpi.c.
|
|
||||||
|
|
||||||
|
|
||||||
5. Example Linux ACPI Device Tree
|
|
||||||
|
|
||||||
The sysfs hierarchy of struct acpi_device objects corresponding to the
|
|
||||||
example ACPI namespace illustrated in Figure 2 with the addition of
|
|
||||||
fixed PWR_BUTTON/SLP_BUTTON devices is shown below.
|
|
||||||
|
|
||||||
+--------------+---+-----------------+
|
|
||||||
| LNXSYSTEM:00 | \ | acpi:LNXSYSTEM: |
|
|
||||||
+--------------+---+-----------------+
|
|
||||||
|
|
|
||||||
| +-------------+-----+----------------+
|
|
||||||
+-| LNXPWRBN:00 | N/A | acpi:LNXPWRBN: |
|
|
||||||
| +-------------+-----+----------------+
|
|
||||||
|
|
|
||||||
| +-------------+-----+----------------+
|
|
||||||
+-| LNXSLPBN:00 | N/A | acpi:LNXSLPBN: |
|
|
||||||
| +-------------+-----+----------------+
|
|
||||||
|
|
|
||||||
| +-----------+------------+--------------+
|
|
||||||
+-| LNXCPU:00 | \_PR_.CPU0 | acpi:LNXCPU: |
|
|
||||||
| +-----------+------------+--------------+
|
|
||||||
|
|
|
||||||
| +-------------+-------+----------------+
|
|
||||||
+-| LNXSYBUS:00 | \_SB_ | acpi:LNXSYBUS: |
|
|
||||||
| +-------------+-------+----------------+
|
|
||||||
| |
|
|
||||||
| | +- - - - - - - +- - - - - - +- - - - - - - -+
|
|
||||||
| +-| PNP0C0D:00 | \_SB_.LID0 | acpi:PNP0C0D: |
|
|
||||||
| | +- - - - - - - +- - - - - - +- - - - - - - -+
|
|
||||||
| |
|
|
||||||
| | +------------+------------+-----------------------+
|
|
||||||
| +-| PNP0A08:00 | \_SB_.PCI0 | acpi:PNP0A08:PNP0A03: |
|
|
||||||
| +------------+------------+-----------------------+
|
|
||||||
| |
|
|
||||||
| | +-----------+-----------------+-----+
|
|
||||||
| +-| device:00 | \_SB_.PCI0.RP03 | N/A |
|
|
||||||
| | +-----------+-----------------+-----+
|
|
||||||
| | |
|
|
||||||
| | | +-------------+----------------------+----------------+
|
|
||||||
| | +-| LNXPOWER:00 | \_SB_.PCI0.RP03.PXP3 | acpi:LNXPOWER: |
|
|
||||||
| | +-------------+----------------------+----------------+
|
|
||||||
| |
|
|
||||||
| | +-------------+-----------------+----------------+
|
|
||||||
| +-| LNXVIDEO:00 | \_SB_.PCI0.GFX0 | acpi:LNXVIDEO: |
|
|
||||||
| +-------------+-----------------+----------------+
|
|
||||||
| |
|
|
||||||
| | +-----------+-----------------+-----+
|
|
||||||
| +-| device:01 | \_SB_.PCI0.DD01 | N/A |
|
|
||||||
| +-----------+-----------------+-----+
|
|
||||||
|
|
|
||||||
| +-------------+-------+----------------+
|
|
||||||
+-| LNXSYBUS:01 | \_TZ_ | acpi:LNXSYBUS: |
|
|
||||||
+-------------+-------+----------------+
|
|
||||||
|
|
|
||||||
| +-------------+------------+----------------+
|
|
||||||
+-| LNXPOWER:0a | \_TZ_.FN00 | acpi:LNXPOWER: |
|
|
||||||
| +-------------+------------+----------------+
|
|
||||||
|
|
|
||||||
| +------------+------------+---------------+
|
|
||||||
+-| PNP0C0B:00 | \_TZ_.FAN0 | acpi:PNP0C0B: |
|
|
||||||
| +------------+------------+---------------+
|
|
||||||
|
|
|
||||||
| +-------------+------------+----------------+
|
|
||||||
+-| LNXTHERM:00 | \_TZ_.TZ00 | acpi:LNXTHERM: |
|
|
||||||
+-------------+------------+----------------+
|
|
||||||
|
|
||||||
Figure 3. Example Linux ACPI Device Tree
|
|
||||||
|
|
||||||
NOTE: Each node is represented as "object/path/modalias", where:
|
|
||||||
1. 'object' is the name of the object's directory in sysfs.
|
|
||||||
2. 'path' is the ACPI namespace path of the corresponding
|
|
||||||
ACPI namespace object, as returned by the object's 'path'
|
|
||||||
sysfs attribute.
|
|
||||||
3. 'modalias' is the value of the object's 'modalias' sysfs
|
|
||||||
attribute (as described earlier in this document).
|
|
||||||
NOTE: N/A indicates the device object does not have the 'path' or the
|
|
||||||
'modalias' attribute.
|
|
||||||
@@ -1,187 +0,0 @@
|
|||||||
ACPI _OSI and _REV methods
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
An ACPI BIOS can use the "Operating System Interfaces" method (_OSI)
|
|
||||||
to find out what the operating system supports. E.g. if the BIOS
|
|
||||||
AML code includes _OSI("XYZ"), the kernel's AML interpreter
|
|
||||||
can evaluate that method, look to see if it supports 'XYZ'
|
|
||||||
and answer YES or NO to the BIOS.
|
|
||||||
|
|
||||||
The ACPI _REV method returns the "Revision of the ACPI specification
|
|
||||||
that OSPM supports".
|
|
||||||
|
|
||||||
This document explains how and why the BIOS and Linux should use these methods.
|
|
||||||
It also explains how and why they are widely misused.
|
|
||||||
|
|
||||||
How to use _OSI
|
|
||||||
---------------
|
|
||||||
|
|
||||||
Linux runs on two groups of machines -- those that are tested by the OEM
|
|
||||||
to be compatible with Linux, and those that were never tested with Linux,
|
|
||||||
but where Linux was installed to replace the original OS (Windows or OSX).
|
|
||||||
|
|
||||||
The larger group is the systems tested to run only Windows. Not only that,
|
|
||||||
but many were tested to run with just one specific version of Windows.
|
|
||||||
So even though the BIOS may use _OSI to query what version of Windows is running,
|
|
||||||
only a single path through the BIOS has actually been tested.
|
|
||||||
Experience shows that taking untested paths through the BIOS
|
|
||||||
exposes Linux to an entire category of BIOS bugs.
|
|
||||||
For this reason, Linux _OSI defaults must continue to claim compatibility
|
|
||||||
with all versions of Windows.
|
|
||||||
|
|
||||||
But Linux isn't actually compatible with Windows, and the Linux community
|
|
||||||
has also been hurt with regressions when Linux adds the latest version of
|
|
||||||
Windows to its list of _OSI strings. So it is possible that additional strings
|
|
||||||
will be more thoroughly vetted before shipping upstream in the future.
|
|
||||||
But it is likely that they will all eventually be added.
|
|
||||||
|
|
||||||
What should an OEM do if they want to support Linux and Windows
|
|
||||||
using the same BIOS image? Often they need to do something different
|
|
||||||
for Linux to deal with how Linux is different from Windows.
|
|
||||||
Here the BIOS should ask exactly what it wants to know:
|
|
||||||
|
|
||||||
_OSI("Linux-OEM-my_interface_name")
|
|
||||||
where 'OEM' is needed if this is an OEM-specific hook,
|
|
||||||
and 'my_interface_name' describes the hook, which could be a
|
|
||||||
quirk, a bug, or a bug-fix.
|
|
||||||
|
|
||||||
In addition, the OEM should send a patch to upstream Linux
|
|
||||||
via the linux-acpi@vger.kernel.org mailing list. When that patch
|
|
||||||
is checked into Linux, the OS will answer "YES" when the BIOS
|
|
||||||
on the OEM's system uses _OSI to ask if the interface is supported
|
|
||||||
by the OS. Linux distributors can back-port that patch for Linux
|
|
||||||
pre-installs, and it will be included by all distributions that
|
|
||||||
re-base to upstream. If the distribution can not update the kernel binary,
|
|
||||||
they can also add an acpi_osi=Linux-OEM-my_interface_name
|
|
||||||
cmdline parameter to the boot loader, as needed.
|
|
||||||
|
|
||||||
If the string refers to a feature where the upstream kernel
|
|
||||||
eventually grows support, a patch should be sent to remove
|
|
||||||
the string when that support is added to the kernel.
|
|
||||||
|
|
||||||
That was easy. Read on, to find out how to do it wrong.
|
|
||||||
|
|
||||||
Before _OSI, there was _OS
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
ACPI 1.0 specified "_OS" as an
|
|
||||||
"object that evaluates to a string that identifies the operating system."
|
|
||||||
|
|
||||||
The ACPI BIOS flow would include an evaluation of _OS, and the AML
|
|
||||||
interpreter in the kernel would return to it a string identifying the OS:
|
|
||||||
|
|
||||||
Windows 98, SE: "Microsoft Windows"
|
|
||||||
Windows ME: "Microsoft WindowsME:Millenium Edition"
|
|
||||||
Windows NT: "Microsoft Windows NT"
|
|
||||||
|
|
||||||
The idea was that on a platform tasked with running multiple OS's,
|
|
||||||
the BIOS could use _OS to enable devices that an OS
|
|
||||||
might support, or enable quirks or bug workarounds
|
|
||||||
necessary to make the platform compatible with that pre-existing OS.
|
|
||||||
|
|
||||||
But _OS had fundamental problems. First, the BIOS needed to know the name
|
|
||||||
of every possible version of the OS that would run on it, and needed to know
|
|
||||||
all the quirks of those OS's. Certainly it would make more sense
|
|
||||||
for the BIOS to ask *specific* things of the OS, such as
|
|
||||||
"do you support a specific interface", and thus in ACPI 3.0,
|
|
||||||
_OSI was born to replace _OS.
|
|
||||||
|
|
||||||
_OS was abandoned, though even today, many BIOS look for
|
|
||||||
_OS "Microsoft Windows NT", though it seems somewhat far-fetched
|
|
||||||
that anybody would install those old operating systems
|
|
||||||
over what came with the machine.
|
|
||||||
|
|
||||||
Linux answers "Microsoft Windows NT" to please that BIOS idiom.
|
|
||||||
That is the *only* viable strategy, as that is what modern Windows does,
|
|
||||||
and so doing otherwise could steer the BIOS down an untested path.
|
|
||||||
|
|
||||||
_OSI is born, and immediately misused
|
|
||||||
--------------------------------------
|
|
||||||
|
|
||||||
With _OSI, the *BIOS* provides the string describing an interface,
|
|
||||||
and asks the OS: "YES/NO, are you compatible with this interface?"
|
|
||||||
|
|
||||||
eg. _OSI("3.0 Thermal Model") would return TRUE if the OS knows how
|
|
||||||
to deal with the thermal extensions made to the ACPI 3.0 specification.
|
|
||||||
An old OS that doesn't know about those extensions would answer FALSE,
|
|
||||||
and a new OS may be able to return TRUE.
|
|
||||||
|
|
||||||
For an OS-specific interface, the ACPI spec said that the BIOS and the OS
|
|
||||||
were to agree on a string of the form such as "Windows-interface_name".
|
|
||||||
|
|
||||||
But two bad things happened. First, the Windows ecosystem used _OSI
|
|
||||||
not as designed, but as a direct replacement for _OS -- identifying
|
|
||||||
the OS version, rather than an OS supported interface. Indeed, right
|
|
||||||
from the start, the ACPI 3.0 spec itself codified this misuse
|
|
||||||
in example code using _OSI("Windows 2001").
|
|
||||||
|
|
||||||
This misuse was adopted and continues today.
|
|
||||||
|
|
||||||
Linux had no choice but to also return TRUE to _OSI("Windows 2001")
|
|
||||||
and its successors. To do otherwise would virtually guarantee breaking
|
|
||||||
a BIOS that has been tested only with that _OSI returning TRUE.
|
|
||||||
|
|
||||||
This strategy is problematic, as Linux is never completely compatible with
|
|
||||||
the latest version of Windows, and sometimes it takes more than a year
|
|
||||||
to iron out incompatibilities.
|
|
||||||
|
|
||||||
Not to be out-done, the Linux community made things worse by returning TRUE
|
|
||||||
to _OSI("Linux"). Doing so is even worse than the Windows misuse
|
|
||||||
of _OSI, as "Linux" does not even contain any version information.
|
|
||||||
_OSI("Linux") led to some BIOSes malfunctioning due to BIOS writers
|
|
||||||
using it in untested BIOS flows. But some OEMs used _OSI("Linux")
|
|
||||||
in tested flows to support real Linux features. In 2009, Linux
|
|
||||||
removed _OSI("Linux"), and added a cmdline parameter to restore it
|
|
||||||
for legacy systems that still needed it. Further a BIOS_BUG warning prints
|
|
||||||
for all BIOS's that invoke it.
|
|
||||||
|
|
||||||
No BIOS should use _OSI("Linux").
|
|
||||||
|
|
||||||
The result is a strategy for Linux to maximize compatibility with
|
|
||||||
ACPI BIOS that are tested on Windows machines. There is a real risk
|
|
||||||
of over-stating that compatibility; but the alternative has often been
|
|
||||||
catastrophic failure resulting from the BIOS taking paths that
|
|
||||||
were never validated under *any* OS.
|
|
||||||
|
|
||||||
Do not use _REV
|
|
||||||
---------------
|
|
||||||
|
|
||||||
Since _OSI("Linux") went away, some BIOS writers used _REV
|
|
||||||
to support Linux and Windows differences in the same BIOS.
|
|
||||||
|
|
||||||
_REV was defined in ACPI 1.0 to return the version of ACPI
|
|
||||||
supported by the OS and the OS AML interpreter.
|
|
||||||
|
|
||||||
Modern Windows returns _REV = 2. Linux used ACPI_CA_SUPPORT_LEVEL,
|
|
||||||
which would increment, based on the version of the spec supported.
|
|
||||||
|
|
||||||
Unfortunately, _REV was also misused. E.g. some BIOSes would check
|
|
||||||
for _REV = 3, and do something for Linux, but when Linux returned
|
|
||||||
_REV = 4, that support broke.
|
|
||||||
|
|
||||||
In response to this problem, Linux returns _REV = 2 always,
|
|
||||||
from mid-2015 onward. The ACPI specification will also be updated
|
|
||||||
to reflect that _REV is deprecated, and always returns 2.
|
|
||||||
|
|
||||||
Apple Mac and _OSI("Darwin")
|
|
||||||
----------------------------
|
|
||||||
|
|
||||||
On Apple's Mac platforms, the ACPI BIOS invokes _OSI("Darwin")
|
|
||||||
to determine if the machine is running Apple OSX.
|
|
||||||
|
|
||||||
Like Linux's _OSI("*Windows*") strategy, Linux defaults to
|
|
||||||
answering YES to _OSI("Darwin") to enable full access
|
|
||||||
to the hardware and validated BIOS paths seen by OSX.
|
|
||||||
Just like on Windows-tested platforms, this strategy has risks.
|
|
||||||
|
|
||||||
Starting in Linux-3.18, the kernel answered YES to _OSI("Darwin")
|
|
||||||
for the purpose of enabling Mac Thunderbolt support. Further,
|
|
||||||
if the kernel noticed _OSI("Darwin") being invoked, it additionally
|
|
||||||
disabled all _OSI("*Windows*") to keep poorly written Mac BIOS
|
|
||||||
from going down untested combinations of paths.
|
|
||||||
|
|
||||||
The Linux-3.18 change in default caused power regressions on Mac
|
|
||||||
laptops, and the 3.18 implementation did not allow changing
|
|
||||||
the default via cmdline "acpi_osi=!Darwin". Linux-4.7 fixed
|
|
||||||
the ability to use acpi_osi=!Darwin as a workaround, and
|
|
||||||
we hope to see Mac Thunderbolt power management support in Linux-4.11.
|
|
||||||
@@ -1,77 +0,0 @@
|
|||||||
ACPI Scan Handlers
|
|
||||||
|
|
||||||
Copyright (C) 2012, Intel Corporation
|
|
||||||
Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
During system initialization and ACPI-based device hot-add, the ACPI namespace
|
|
||||||
is scanned in search of device objects that generally represent various pieces
|
|
||||||
of hardware. This causes a struct acpi_device object to be created and
|
|
||||||
registered with the driver core for every device object in the ACPI namespace
|
|
||||||
and the hierarchy of those struct acpi_device objects reflects the namespace
|
|
||||||
layout (i.e. parent device objects in the namespace are represented by parent
|
|
||||||
struct acpi_device objects and analogously for their children). Those struct
|
|
||||||
acpi_device objects are referred to as "device nodes" in what follows, but they
|
|
||||||
should not be confused with struct device_node objects used by the Device Trees
|
|
||||||
parsing code (although their role is analogous to the role of those objects).
|
|
||||||
|
|
||||||
During ACPI-based device hot-remove, device nodes representing pieces of hardware
|
|
||||||
being removed are unregistered and deleted.
|
|
||||||
|
|
||||||
The core ACPI namespace scanning code in drivers/acpi/scan.c carries out basic
|
|
||||||
initialization of device nodes, such as retrieving common configuration
|
|
||||||
information from the device objects represented by them and populating them with
|
|
||||||
appropriate data, but some of them require additional handling after they have
|
|
||||||
been registered. For example, if the given device node represents a PCI host
|
|
||||||
bridge, its registration should cause the PCI bus under that bridge to be
|
|
||||||
enumerated and PCI devices on that bus to be registered with the driver core.
|
|
||||||
Similarly, if the device node represents a PCI interrupt link, it is necessary
|
|
||||||
to configure that link so that the kernel can use it.
|
|
||||||
|
|
||||||
Those additional configuration tasks usually depend on the type of the hardware
|
|
||||||
component represented by the given device node which can be determined on the
|
|
||||||
basis of the device node's hardware ID (HID). They are performed by objects
|
|
||||||
called ACPI scan handlers represented by the following structure:
|
|
||||||
|
|
||||||
struct acpi_scan_handler {
|
|
||||||
const struct acpi_device_id *ids;
|
|
||||||
struct list_head list_node;
|
|
||||||
int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id);
|
|
||||||
void (*detach)(struct acpi_device *dev);
|
|
||||||
};
|
|
||||||
|
|
||||||
where ids is the list of IDs of device nodes the given handler is supposed to
|
|
||||||
take care of, list_node is the hook to the global list of ACPI scan handlers
|
|
||||||
maintained by the ACPI core and the .attach() and .detach() callbacks are
|
|
||||||
executed, respectively, after registration of new device nodes and before
|
|
||||||
unregistration of device nodes the handler attached to previously.
|
|
||||||
|
|
||||||
The namespace scanning function, acpi_bus_scan(), first registers all of the
|
|
||||||
device nodes in the given namespace scope with the driver core. Then, it tries
|
|
||||||
to match a scan handler against each of them using the ids arrays of the
|
|
||||||
available scan handlers. If a matching scan handler is found, its .attach()
|
|
||||||
callback is executed for the given device node. If that callback returns 1,
|
|
||||||
that means that the handler has claimed the device node and is now responsible
|
|
||||||
for carrying out any additional configuration tasks related to it. It also will
|
|
||||||
be responsible for preparing the device node for unregistration in that case.
|
|
||||||
The device node's handler field is then populated with the address of the scan
|
|
||||||
handler that has claimed it.
|
|
||||||
|
|
||||||
If the .attach() callback returns 0, it means that the device node is not
|
|
||||||
interesting to the given scan handler and may be matched against the next scan
|
|
||||||
handler in the list. If it returns a (negative) error code, that means that
|
|
||||||
the namespace scan should be terminated due to a serious error. The error code
|
|
||||||
returned should then reflect the type of the error.
|
|
||||||
|
|
||||||
The namespace trimming function, acpi_bus_trim(), first executes .detach()
|
|
||||||
callbacks from the scan handlers of all device nodes in the given namespace
|
|
||||||
scope (if they have scan handlers). Next, it unregisters all of the device
|
|
||||||
nodes in that scope.
|
|
||||||
|
|
||||||
ACPI scan handlers can be added to the list maintained by the ACPI core with the
|
|
||||||
help of the acpi_scan_add_handler() function taking a pointer to the new scan
|
|
||||||
handler as an argument. The order in which scan handlers are added to the list
|
|
||||||
is the order in which they are matched against device nodes during namespace
|
|
||||||
scans.
|
|
||||||
|
|
||||||
All scan handlers must be added to the list before acpi_bus_scan() is run for the
|
|
||||||
first time and they cannot be removed from it.
|
|
||||||
@@ -1,172 +0,0 @@
|
|||||||
|
|
||||||
In order to support ACPI open-ended hardware configurations (e.g. development
|
|
||||||
boards) we need a way to augment the ACPI configuration provided by the firmware
|
|
||||||
image. A common example is connecting sensors on I2C / SPI buses on development
|
|
||||||
boards.
|
|
||||||
|
|
||||||
Although this can be accomplished by creating a kernel platform driver or
|
|
||||||
recompiling the firmware image with updated ACPI tables, neither is practical:
|
|
||||||
the former proliferates board specific kernel code while the latter requires
|
|
||||||
access to firmware tools which are often not publicly available.
|
|
||||||
|
|
||||||
Because ACPI supports external references in AML code a more practical
|
|
||||||
way to augment firmware ACPI configuration is by dynamically loading
|
|
||||||
user defined SSDT tables that contain the board specific information.
|
|
||||||
|
|
||||||
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
|
||||||
Minnowboard MAX development board exposed via the LSE connector [1], the
|
|
||||||
following ASL code can be used:
|
|
||||||
|
|
||||||
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
|
||||||
{
|
|
||||||
External (\_SB.I2C6, DeviceObj)
|
|
||||||
|
|
||||||
Scope (\_SB.I2C6)
|
|
||||||
{
|
|
||||||
Device (STAC)
|
|
||||||
{
|
|
||||||
Name (_ADR, Zero)
|
|
||||||
Name (_HID, "BMA222E")
|
|
||||||
|
|
||||||
Method (_CRS, 0, Serialized)
|
|
||||||
{
|
|
||||||
Name (RBUF, ResourceTemplate ()
|
|
||||||
{
|
|
||||||
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
|
||||||
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
|
||||||
ResourceConsumer, ,)
|
|
||||||
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
|
||||||
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
|
||||||
{ // Pin list
|
|
||||||
0
|
|
||||||
}
|
|
||||||
})
|
|
||||||
Return (RBUF)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
which can then be compiled to AML binary format:
|
|
||||||
|
|
||||||
$ iasl minnowmax.asl
|
|
||||||
|
|
||||||
Intel ACPI Component Architecture
|
|
||||||
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
|
||||||
Copyright (c) 2000 - 2014 Intel Corporation
|
|
||||||
|
|
||||||
ASL Input: minnowmax.asl - 30 lines, 614 bytes, 7 keywords
|
|
||||||
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
|
||||||
|
|
||||||
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
|
||||||
|
|
||||||
The resulting AML code can then be loaded by the kernel using one of the methods
|
|
||||||
below.
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from initrd ==
|
|
||||||
|
|
||||||
This option allows loading of user defined SSDTs from initrd and it is useful
|
|
||||||
when the system does not support EFI or when there is not enough EFI storage.
|
|
||||||
|
|
||||||
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
|
||||||
aml code must be placed in the first, uncompressed, initrd under the
|
|
||||||
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
|
||||||
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
|
||||||
initrd_table_override.txt for more details.
|
|
||||||
|
|
||||||
Here is an example:
|
|
||||||
|
|
||||||
# Add the raw ACPI tables to an uncompressed cpio archive.
|
|
||||||
# They must be put into a /kernel/firmware/acpi directory inside the
|
|
||||||
# cpio archive.
|
|
||||||
# The uncompressed cpio archive must be the first.
|
|
||||||
# Other, typically compressed cpio archives, must be
|
|
||||||
# concatenated on top of the uncompressed one.
|
|
||||||
mkdir -p kernel/firmware/acpi
|
|
||||||
cp ssdt.aml kernel/firmware/acpi
|
|
||||||
|
|
||||||
# Create the uncompressed cpio archive and concatenate the original initrd
|
|
||||||
# on top:
|
|
||||||
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
|
||||||
cat /boot/initrd >>/boot/instrumented_initrd
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from EFI variables ==
|
|
||||||
|
|
||||||
This is the preferred method, when EFI is supported on the platform, because it
|
|
||||||
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
|
||||||
is also work underway to implement EFI support for loading user defined SSDTs
|
|
||||||
and using this method will make it easier to convert to the EFI loading
|
|
||||||
mechanism when that will arrive.
|
|
||||||
|
|
||||||
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
|
||||||
parameter can be used. The argument for the option is the variable name to
|
|
||||||
use. If there are multiple variables with the same name but with different
|
|
||||||
vendor GUIDs, all of them will be loaded.
|
|
||||||
|
|
||||||
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
|
||||||
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
|
||||||
recent distributions.
|
|
||||||
|
|
||||||
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
|
||||||
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
|
||||||
variable. Please note that the file name needs to be specially formatted as
|
|
||||||
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
|
||||||
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
|
||||||
include/linux/efi.h). Writing to the file must also be done with one write
|
|
||||||
operation.
|
|
||||||
|
|
||||||
For example, you can use the following bash script to create/update an EFI
|
|
||||||
variable with the content from a given file:
|
|
||||||
|
|
||||||
#!/bin/sh -e
|
|
||||||
|
|
||||||
while ! [ -z "$1" ]; do
|
|
||||||
case "$1" in
|
|
||||||
"-f") filename="$2"; shift;;
|
|
||||||
"-g") guid="$2"; shift;;
|
|
||||||
*) name="$1";;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
|
|
||||||
usage()
|
|
||||||
{
|
|
||||||
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
[ -n "$name" -a -f "$filename" ] || usage
|
|
||||||
|
|
||||||
EFIVARFS="/sys/firmware/efi/efivars"
|
|
||||||
|
|
||||||
[ -d "$EFIVARFS" ] || exit 2
|
|
||||||
|
|
||||||
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
|
||||||
mount -t efivarfs none $EFIVARFS
|
|
||||||
fi
|
|
||||||
|
|
||||||
# try to pick up an existing GUID
|
|
||||||
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
|
||||||
|
|
||||||
# use a randomly generated GUID
|
|
||||||
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
|
||||||
|
|
||||||
# efivarfs expects all of the data in one write
|
|
||||||
tmp=$(mktemp)
|
|
||||||
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
|
||||||
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
|
||||||
rm $tmp
|
|
||||||
|
|
||||||
== Loading ACPI SSDTs from configfs ==
|
|
||||||
|
|
||||||
This option allows loading of user defined SSDTs from userspace via the configfs
|
|
||||||
interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
|
|
||||||
mounted. In the following examples, we assume that configfs has been mounted in
|
|
||||||
/config.
|
|
||||||
|
|
||||||
New tables can be loaded by creating new directories in /config/acpi/table/ and
|
|
||||||
writing the SSDT aml code in the aml attribute:
|
|
||||||
|
|
||||||
cd /config/acpi/table
|
|
||||||
mkdir my_ssdt
|
|
||||||
cat ~/ssdt.aml > my_ssdt/aml
|
|
||||||
@@ -1,106 +0,0 @@
|
|||||||
ACPI video extensions
|
|
||||||
~~~~~~~~~~~~~~~~~~~~~
|
|
||||||
|
|
||||||
This driver implements the ACPI Extensions For Display Adapters for
|
|
||||||
integrated graphics devices on motherboard, as specified in ACPI 2.0
|
|
||||||
Specification, Appendix B, allowing to perform some basic control like
|
|
||||||
defining the video POST device, retrieving EDID information or to
|
|
||||||
setup a video output, etc. Note that this is a reference implementation
|
|
||||||
only. It may or may not work for your integrated video device.
|
|
||||||
|
|
||||||
The ACPI video driver does 3 things regarding backlight control:
|
|
||||||
|
|
||||||
1 Export a sysfs interface for user space to control backlight level
|
|
||||||
|
|
||||||
If the ACPI table has a video device, and acpi_backlight=vendor kernel
|
|
||||||
command line is not present, the driver will register a backlight device
|
|
||||||
and set the required backlight operation structure for it for the sysfs
|
|
||||||
interface control. For every registered class device, there will be a
|
|
||||||
directory named acpi_videoX under /sys/class/backlight.
|
|
||||||
|
|
||||||
The backlight sysfs interface has a standard definition here:
|
|
||||||
Documentation/ABI/stable/sysfs-class-backlight.
|
|
||||||
|
|
||||||
And what ACPI video driver does is:
|
|
||||||
actual_brightness: on read, control method _BQC will be evaluated to
|
|
||||||
get the brightness level the firmware thinks it is at;
|
|
||||||
bl_power: not implemented, will set the current brightness instead;
|
|
||||||
brightness: on write, control method _BCM will run to set the requested
|
|
||||||
brightness level;
|
|
||||||
max_brightness: Derived from the _BCL package(see below);
|
|
||||||
type: firmware
|
|
||||||
|
|
||||||
Note that ACPI video backlight driver will always use index for
|
|
||||||
brightness, actual_brightness and max_brightness. So if we have
|
|
||||||
the following _BCL package:
|
|
||||||
|
|
||||||
Method (_BCL, 0, NotSerialized)
|
|
||||||
{
|
|
||||||
Return (Package (0x0C)
|
|
||||||
{
|
|
||||||
0x64,
|
|
||||||
0x32,
|
|
||||||
0x0A,
|
|
||||||
0x14,
|
|
||||||
0x1E,
|
|
||||||
0x28,
|
|
||||||
0x32,
|
|
||||||
0x3C,
|
|
||||||
0x46,
|
|
||||||
0x50,
|
|
||||||
0x5A,
|
|
||||||
0x64
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
The first two levels are for when the laptop is on AC or on battery and are
|
|
||||||
not used by Linux currently. The remaining 10 levels are supported levels
|
|
||||||
that we can choose from. The applicable index values are from 0 (that
|
|
||||||
corresponds to the 0x0A brightness value) to 9 (that corresponds to the
|
|
||||||
0x64 brightness value) inclusive. Each of those index values is regarded
|
|
||||||
as a "brightness level" indicator. Thus from the user space perspective
|
|
||||||
the range of available brightness levels is from 0 to 9 (max_brightness)
|
|
||||||
inclusive.
|
|
||||||
|
|
||||||
2 Notify user space about hotkey event
|
|
||||||
|
|
||||||
There are generally two cases for hotkey event reporting:
|
|
||||||
i) For some laptops, when user presses the hotkey, a scancode will be
|
|
||||||
generated and sent to user space through the input device created by
|
|
||||||
the keyboard driver as a key type input event, with proper remap, the
|
|
||||||
following key code will appear to user space:
|
|
||||||
|
|
||||||
EV_KEY, KEY_BRIGHTNESSUP
|
|
||||||
EV_KEY, KEY_BRIGHTNESSDOWN
|
|
||||||
etc.
|
|
||||||
|
|
||||||
For this case, ACPI video driver does not need to do anything(actually,
|
|
||||||
it doesn't even know this happened).
|
|
||||||
|
|
||||||
ii) For some laptops, the press of the hotkey will not generate the
|
|
||||||
scancode, instead, firmware will notify the video device ACPI node
|
|
||||||
about the event. The event value is defined in the ACPI spec. ACPI
|
|
||||||
video driver will generate a key type input event according to the
|
|
||||||
notify value it received and send the event to user space through the
|
|
||||||
input device it created:
|
|
||||||
|
|
||||||
event keycode
|
|
||||||
0x86 KEY_BRIGHTNESSUP
|
|
||||||
0x87 KEY_BRIGHTNESSDOWN
|
|
||||||
etc.
|
|
||||||
|
|
||||||
so this would lead to the same effect as case i) now.
|
|
||||||
|
|
||||||
Once user space tool receives this event, it can modify the backlight
|
|
||||||
level through the sysfs interface.
|
|
||||||
|
|
||||||
3 Change backlight level in the kernel
|
|
||||||
|
|
||||||
This works for machines covered by case ii) in Section 2. Once the driver
|
|
||||||
received a notification, it will set the backlight level accordingly. This does
|
|
||||||
not affect the sending of event to user space, they are always sent to user
|
|
||||||
space regardless of whether or not the video module controls the backlight level
|
|
||||||
directly. This behaviour can be controlled through the brightness_switch_enabled
|
|
||||||
module parameter as documented in admin-guide/kernel-parameters.rst. It is recommended to
|
|
||||||
disable this behaviour once a GUI environment starts up and wants to have full
|
|
||||||
control of the backlight level.
|
|
||||||
76
Documentation/admin-guide/acpi/cppc_sysfs.rst
Normal file
76
Documentation/admin-guide/acpi/cppc_sysfs.rst
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
==================================================
|
||||||
|
Collaborative Processor Performance Control (CPPC)
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
CPPC
|
||||||
|
====
|
||||||
|
|
||||||
|
CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
|
||||||
|
performance of a logical processor on a contiguous and abstract performance
|
||||||
|
scale. CPPC exposes a set of registers to describe abstract performance scale,
|
||||||
|
to request performance levels and to measure per-cpu delivered performance.
|
||||||
|
|
||||||
|
For more details on CPPC please refer to the ACPI specification at:
|
||||||
|
|
||||||
|
http://uefi.org/specifications
|
||||||
|
|
||||||
|
Some of the CPPC registers are exposed via sysfs under::
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/cpuX/acpi_cppc/
|
||||||
|
|
||||||
|
for each cpu X::
|
||||||
|
|
||||||
|
$ ls -lR /sys/devices/system/cpu/cpu0/acpi_cppc/
|
||||||
|
/sys/devices/system/cpu/cpu0/acpi_cppc/:
|
||||||
|
total 0
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 feedback_ctrs
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 highest_perf
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_freq
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_nonlinear_perf
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 lowest_perf
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_freq
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 nominal_perf
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 reference_perf
|
||||||
|
-r--r--r-- 1 root root 65536 Mar 5 19:38 wraparound_time
|
||||||
|
|
||||||
|
* highest_perf : Highest performance of this processor (abstract scale).
|
||||||
|
* nominal_perf : Highest sustained performance of this processor
|
||||||
|
(abstract scale).
|
||||||
|
* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
|
||||||
|
power savings (abstract scale).
|
||||||
|
* lowest_perf : Lowest performance of this processor (abstract scale).
|
||||||
|
|
||||||
|
* lowest_freq : CPU frequency corresponding to lowest_perf (in MHz).
|
||||||
|
* nominal_freq : CPU frequency corresponding to nominal_perf (in MHz).
|
||||||
|
The above frequencies should only be used to report processor performance in
|
||||||
|
frequency instead of abstract scale. These values should not be used for any
|
||||||
|
functional decisions.
|
||||||
|
|
||||||
|
* feedback_ctrs : Includes both Reference and delivered performance counter.
|
||||||
|
Reference counter ticks up proportional to processor's reference performance.
|
||||||
|
Delivered counter ticks up proportional to processor's delivered performance.
|
||||||
|
* wraparound_time: Minimum time for the feedback counters to wraparound
|
||||||
|
(seconds).
|
||||||
|
* reference_perf : Performance level at which reference performance counter
|
||||||
|
accumulates (abstract scale).
|
||||||
|
|
||||||
|
|
||||||
|
Computing Average Delivered Performance
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
Below describes the steps to compute the average performance delivered by
|
||||||
|
taking two different snapshots of feedback counters at time T1 and T2.
|
||||||
|
|
||||||
|
T1: Read feedback_ctrs as fbc_t1
|
||||||
|
Wait or run some workload
|
||||||
|
|
||||||
|
T2: Read feedback_ctrs as fbc_t2
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
|
||||||
|
reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
|
||||||
|
|
||||||
|
delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
|
||||||
13
Documentation/admin-guide/acpi/dsdt-override.rst
Normal file
13
Documentation/admin-guide/acpi/dsdt-override.rst
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===============
|
||||||
|
Overriding DSDT
|
||||||
|
===============
|
||||||
|
|
||||||
|
Linux supports a method of overriding the BIOS DSDT:
|
||||||
|
|
||||||
|
CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
|
||||||
|
|
||||||
|
When to use this method is described in detail on the
|
||||||
|
Linux/ACPI home page:
|
||||||
|
https://01.org/linux-acpi/documentation/overriding-dsdt
|
||||||
14
Documentation/admin-guide/acpi/index.rst
Normal file
14
Documentation/admin-guide/acpi/index.rst
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
============
|
||||||
|
ACPI Support
|
||||||
|
============
|
||||||
|
|
||||||
|
Here we document in detail how to interact with various mechanisms in
|
||||||
|
the Linux ACPI support.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
initrd_table_override
|
||||||
|
dsdt-override
|
||||||
|
ssdt-overlays
|
||||||
|
cppc_sysfs
|
||||||
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
115
Documentation/admin-guide/acpi/initrd_table_override.rst
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
================================
|
||||||
|
Upgrading ACPI tables via initrd
|
||||||
|
================================
|
||||||
|
|
||||||
|
What is this about
|
||||||
|
==================
|
||||||
|
|
||||||
|
If the ACPI_TABLE_UPGRADE compile option is true, it is possible to
|
||||||
|
upgrade the ACPI execution environment that is defined by the ACPI tables
|
||||||
|
via upgrading the ACPI tables provided by the BIOS with an instrumented,
|
||||||
|
modified, more recent version one, or installing brand new ACPI tables.
|
||||||
|
|
||||||
|
When building initrd with kernel in a single image, option
|
||||||
|
ACPI_TABLE_OVERRIDE_VIA_BUILTIN_INITRD should also be true for this
|
||||||
|
feature to work.
|
||||||
|
|
||||||
|
For a full list of ACPI tables that can be upgraded/installed, take a look
|
||||||
|
at the char `*table_sigs[MAX_ACPI_SIGNATURE];` definition in
|
||||||
|
drivers/acpi/tables.c.
|
||||||
|
|
||||||
|
All ACPI tables iasl (Intel's ACPI compiler and disassembler) knows should
|
||||||
|
be overridable, except:
|
||||||
|
|
||||||
|
- ACPI_SIG_RSDP (has a signature of 6 bytes)
|
||||||
|
- ACPI_SIG_FACS (does not have an ordinary ACPI table header)
|
||||||
|
|
||||||
|
Both could get implemented as well.
|
||||||
|
|
||||||
|
|
||||||
|
What is this for
|
||||||
|
================
|
||||||
|
|
||||||
|
Complain to your platform/BIOS vendor if you find a bug which is so severe
|
||||||
|
that a workaround is not accepted in the Linux kernel. And this facility
|
||||||
|
allows you to upgrade the buggy tables before your platform/BIOS vendor
|
||||||
|
releases an upgraded BIOS binary.
|
||||||
|
|
||||||
|
This facility can be used by platform/BIOS vendors to provide a Linux
|
||||||
|
compatible environment without modifying the underlying platform firmware.
|
||||||
|
|
||||||
|
This facility also provides a powerful feature to easily debug and test
|
||||||
|
ACPI BIOS table compatibility with the Linux kernel by modifying old
|
||||||
|
platform provided ACPI tables or inserting new ACPI tables.
|
||||||
|
|
||||||
|
It can and should be enabled in any kernel because there is no functional
|
||||||
|
change with not instrumented initrds.
|
||||||
|
|
||||||
|
|
||||||
|
How does it work
|
||||||
|
================
|
||||||
|
::
|
||||||
|
|
||||||
|
# Extract the machine's ACPI tables:
|
||||||
|
cd /tmp
|
||||||
|
acpidump >acpidump
|
||||||
|
acpixtract -a acpidump
|
||||||
|
# Disassemble, modify and recompile them:
|
||||||
|
iasl -d *.dat
|
||||||
|
# For example add this statement into a _PRT (PCI Routing Table) function
|
||||||
|
# of the DSDT:
|
||||||
|
Store("HELLO WORLD", debug)
|
||||||
|
# And increase the OEM Revision. For example, before modification:
|
||||||
|
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000000)
|
||||||
|
# After modification:
|
||||||
|
DefinitionBlock ("DSDT.aml", "DSDT", 2, "INTEL ", "TEMPLATE", 0x00000001)
|
||||||
|
iasl -sa dsdt.dsl
|
||||||
|
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||||
|
# They must be put into a /kernel/firmware/acpi directory inside the cpio
|
||||||
|
# archive. Note that if the table put here matches a platform table
|
||||||
|
# (similar Table Signature, and similar OEMID, and similar OEM Table ID)
|
||||||
|
# with a more recent OEM Revision, the platform table will be upgraded by
|
||||||
|
# this table. If the table put here doesn't match a platform table
|
||||||
|
# (dissimilar Table Signature, or dissimilar OEMID, or dissimilar OEM Table
|
||||||
|
# ID), this table will be appended.
|
||||||
|
mkdir -p kernel/firmware/acpi
|
||||||
|
cp dsdt.aml kernel/firmware/acpi
|
||||||
|
# A maximum of "NR_ACPI_INITRD_TABLES (64)" tables are currently allowed
|
||||||
|
# (see osl.c):
|
||||||
|
iasl -sa facp.dsl
|
||||||
|
iasl -sa ssdt1.dsl
|
||||||
|
cp facp.aml kernel/firmware/acpi
|
||||||
|
cp ssdt1.aml kernel/firmware/acpi
|
||||||
|
# The uncompressed cpio archive must be the first. Other, typically
|
||||||
|
# compressed cpio archives, must be concatenated on top of the uncompressed
|
||||||
|
# one. Following command creates the uncompressed cpio archive and
|
||||||
|
# concatenates the original initrd on top:
|
||||||
|
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||||
|
cat /boot/initrd >>/boot/instrumented_initrd
|
||||||
|
# reboot with increased acpi debug level, e.g. boot params:
|
||||||
|
acpi.debug_level=0x2 acpi.debug_layer=0xFFFFFFFF
|
||||||
|
# and check your syslog:
|
||||||
|
[ 1.268089] ACPI: PCI Interrupt Routing Table [\_SB_.PCI0._PRT]
|
||||||
|
[ 1.272091] [ACPI Debug] String [0x0B] "HELLO WORLD"
|
||||||
|
|
||||||
|
iasl is able to disassemble and recompile quite a lot of different,
|
||||||
|
also static ACPI tables.
|
||||||
|
|
||||||
|
|
||||||
|
Where to retrieve userspace tools
|
||||||
|
=================================
|
||||||
|
|
||||||
|
iasl and acpixtract are part of Intel's ACPICA project:
|
||||||
|
http://acpica.org/
|
||||||
|
|
||||||
|
and should be packaged by distributions (for example in the acpica package
|
||||||
|
on SUSE).
|
||||||
|
|
||||||
|
acpidump can be found in Len Brown's pmtools:
|
||||||
|
ftp://kernel.org/pub/linux/kernel/people/lenb/acpi/utils/pmtools/acpidump
|
||||||
|
|
||||||
|
This tool is also part of the acpica package on SUSE.
|
||||||
|
Alternatively, used ACPI tables can be retrieved via sysfs in latest kernels:
|
||||||
|
/sys/firmware/acpi/tables
|
||||||
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
180
Documentation/admin-guide/acpi/ssdt-overlays.rst
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
SSDT Overlays
|
||||||
|
=============
|
||||||
|
|
||||||
|
In order to support ACPI open-ended hardware configurations (e.g. development
|
||||||
|
boards) we need a way to augment the ACPI configuration provided by the firmware
|
||||||
|
image. A common example is connecting sensors on I2C / SPI buses on development
|
||||||
|
boards.
|
||||||
|
|
||||||
|
Although this can be accomplished by creating a kernel platform driver or
|
||||||
|
recompiling the firmware image with updated ACPI tables, neither is practical:
|
||||||
|
the former proliferates board specific kernel code while the latter requires
|
||||||
|
access to firmware tools which are often not publicly available.
|
||||||
|
|
||||||
|
Because ACPI supports external references in AML code a more practical
|
||||||
|
way to augment firmware ACPI configuration is by dynamically loading
|
||||||
|
user defined SSDT tables that contain the board specific information.
|
||||||
|
|
||||||
|
For example, to enumerate a Bosch BMA222E accelerometer on the I2C bus of the
|
||||||
|
Minnowboard MAX development board exposed via the LSE connector [1], the
|
||||||
|
following ASL code can be used::
|
||||||
|
|
||||||
|
DefinitionBlock ("minnowmax.aml", "SSDT", 1, "Vendor", "Accel", 0x00000003)
|
||||||
|
{
|
||||||
|
External (\_SB.I2C6, DeviceObj)
|
||||||
|
|
||||||
|
Scope (\_SB.I2C6)
|
||||||
|
{
|
||||||
|
Device (STAC)
|
||||||
|
{
|
||||||
|
Name (_ADR, Zero)
|
||||||
|
Name (_HID, "BMA222E")
|
||||||
|
|
||||||
|
Method (_CRS, 0, Serialized)
|
||||||
|
{
|
||||||
|
Name (RBUF, ResourceTemplate ()
|
||||||
|
{
|
||||||
|
I2cSerialBus (0x0018, ControllerInitiated, 0x00061A80,
|
||||||
|
AddressingMode7Bit, "\\_SB.I2C6", 0x00,
|
||||||
|
ResourceConsumer, ,)
|
||||||
|
GpioInt (Edge, ActiveHigh, Exclusive, PullDown, 0x0000,
|
||||||
|
"\\_SB.GPO2", 0x00, ResourceConsumer, , )
|
||||||
|
{ // Pin list
|
||||||
|
0
|
||||||
|
}
|
||||||
|
})
|
||||||
|
Return (RBUF)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
which can then be compiled to AML binary format::
|
||||||
|
|
||||||
|
$ iasl minnowmax.asl
|
||||||
|
|
||||||
|
Intel ACPI Component Architecture
|
||||||
|
ASL Optimizing Compiler version 20140214-64 [Mar 29 2014]
|
||||||
|
Copyright (c) 2000 - 2014 Intel Corporation
|
||||||
|
|
||||||
|
ASL Input: minnowmax.asl - 30 lines, 614 bytes, 7 keywords
|
||||||
|
AML Output: minnowmax.aml - 165 bytes, 6 named objects, 1 executable opcodes
|
||||||
|
|
||||||
|
[1] http://wiki.minnowboard.org/MinnowBoard_MAX#Low_Speed_Expansion_Connector_.28Top.29
|
||||||
|
|
||||||
|
The resulting AML code can then be loaded by the kernel using one of the methods
|
||||||
|
below.
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from initrd
|
||||||
|
==============================
|
||||||
|
|
||||||
|
This option allows loading of user defined SSDTs from initrd and it is useful
|
||||||
|
when the system does not support EFI or when there is not enough EFI storage.
|
||||||
|
|
||||||
|
It works in a similar way with initrd based ACPI tables override/upgrade: SSDT
|
||||||
|
aml code must be placed in the first, uncompressed, initrd under the
|
||||||
|
"kernel/firmware/acpi" path. Multiple files can be used and this will translate
|
||||||
|
in loading multiple tables. Only SSDT and OEM tables are allowed. See
|
||||||
|
initrd_table_override.rst for more details.
|
||||||
|
|
||||||
|
Here is an example::
|
||||||
|
|
||||||
|
# Add the raw ACPI tables to an uncompressed cpio archive.
|
||||||
|
# They must be put into a /kernel/firmware/acpi directory inside the
|
||||||
|
# cpio archive.
|
||||||
|
# The uncompressed cpio archive must be the first.
|
||||||
|
# Other, typically compressed cpio archives, must be
|
||||||
|
# concatenated on top of the uncompressed one.
|
||||||
|
mkdir -p kernel/firmware/acpi
|
||||||
|
cp ssdt.aml kernel/firmware/acpi
|
||||||
|
|
||||||
|
# Create the uncompressed cpio archive and concatenate the original initrd
|
||||||
|
# on top:
|
||||||
|
find kernel | cpio -H newc --create > /boot/instrumented_initrd
|
||||||
|
cat /boot/initrd >>/boot/instrumented_initrd
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from EFI variables
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
This is the preferred method, when EFI is supported on the platform, because it
|
||||||
|
allows a persistent, OS independent way of storing the user defined SSDTs. There
|
||||||
|
is also work underway to implement EFI support for loading user defined SSDTs
|
||||||
|
and using this method will make it easier to convert to the EFI loading
|
||||||
|
mechanism when that will arrive.
|
||||||
|
|
||||||
|
In order to load SSDTs from an EFI variable the efivar_ssdt kernel command line
|
||||||
|
parameter can be used. The argument for the option is the variable name to
|
||||||
|
use. If there are multiple variables with the same name but with different
|
||||||
|
vendor GUIDs, all of them will be loaded.
|
||||||
|
|
||||||
|
In order to store the AML code in an EFI variable the efivarfs filesystem can be
|
||||||
|
used. It is enabled and mounted by default in /sys/firmware/efi/efivars in all
|
||||||
|
recent distributions.
|
||||||
|
|
||||||
|
Creating a new file in /sys/firmware/efi/efivars will automatically create a new
|
||||||
|
EFI variable. Updating a file in /sys/firmware/efi/efivars will update the EFI
|
||||||
|
variable. Please note that the file name needs to be specially formatted as
|
||||||
|
"Name-GUID" and that the first 4 bytes in the file (little-endian format)
|
||||||
|
represent the attributes of the EFI variable (see EFI_VARIABLE_MASK in
|
||||||
|
include/linux/efi.h). Writing to the file must also be done with one write
|
||||||
|
operation.
|
||||||
|
|
||||||
|
For example, you can use the following bash script to create/update an EFI
|
||||||
|
variable with the content from a given file::
|
||||||
|
|
||||||
|
#!/bin/sh -e
|
||||||
|
|
||||||
|
while ! [ -z "$1" ]; do
|
||||||
|
case "$1" in
|
||||||
|
"-f") filename="$2"; shift;;
|
||||||
|
"-g") guid="$2"; shift;;
|
||||||
|
*) name="$1";;
|
||||||
|
esac
|
||||||
|
shift
|
||||||
|
done
|
||||||
|
|
||||||
|
usage()
|
||||||
|
{
|
||||||
|
echo "Syntax: ${0##*/} -f filename [ -g guid ] name"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
[ -n "$name" -a -f "$filename" ] || usage
|
||||||
|
|
||||||
|
EFIVARFS="/sys/firmware/efi/efivars"
|
||||||
|
|
||||||
|
[ -d "$EFIVARFS" ] || exit 2
|
||||||
|
|
||||||
|
if stat -tf $EFIVARFS | grep -q -v de5e81e4; then
|
||||||
|
mount -t efivarfs none $EFIVARFS
|
||||||
|
fi
|
||||||
|
|
||||||
|
# try to pick up an existing GUID
|
||||||
|
[ -n "$guid" ] || guid=$(find "$EFIVARFS" -name "$name-*" | head -n1 | cut -f2- -d-)
|
||||||
|
|
||||||
|
# use a randomly generated GUID
|
||||||
|
[ -n "$guid" ] || guid="$(cat /proc/sys/kernel/random/uuid)"
|
||||||
|
|
||||||
|
# efivarfs expects all of the data in one write
|
||||||
|
tmp=$(mktemp)
|
||||||
|
/bin/echo -ne "\007\000\000\000" | cat - $filename > $tmp
|
||||||
|
dd if=$tmp of="$EFIVARFS/$name-$guid" bs=$(stat -c %s $tmp)
|
||||||
|
rm $tmp
|
||||||
|
|
||||||
|
Loading ACPI SSDTs from configfs
|
||||||
|
================================
|
||||||
|
|
||||||
|
This option allows loading of user defined SSDTs from userspace via the configfs
|
||||||
|
interface. The CONFIG_ACPI_CONFIGFS option must be selected and configfs must be
|
||||||
|
mounted. In the following examples, we assume that configfs has been mounted in
|
||||||
|
/config.
|
||||||
|
|
||||||
|
New tables can be loaded by creating new directories in /config/acpi/table/ and
|
||||||
|
writing the SSDT aml code in the aml attribute::
|
||||||
|
|
||||||
|
cd /config/acpi/table
|
||||||
|
mkdir my_ssdt
|
||||||
|
cat ~/ssdt.aml > my_ssdt/aml
|
||||||
@@ -864,6 +864,8 @@ All cgroup core files are prefixed with "cgroup."
|
|||||||
populated
|
populated
|
||||||
1 if the cgroup or its descendants contains any live
|
1 if the cgroup or its descendants contains any live
|
||||||
processes; otherwise, 0.
|
processes; otherwise, 0.
|
||||||
|
frozen
|
||||||
|
1 if the cgroup is frozen; otherwise, 0.
|
||||||
|
|
||||||
cgroup.max.descendants
|
cgroup.max.descendants
|
||||||
A read-write single value files. The default is "max".
|
A read-write single value files. The default is "max".
|
||||||
@@ -897,6 +899,31 @@ All cgroup core files are prefixed with "cgroup."
|
|||||||
A dying cgroup can consume system resources not exceeding
|
A dying cgroup can consume system resources not exceeding
|
||||||
limits, which were active at the moment of cgroup deletion.
|
limits, which were active at the moment of cgroup deletion.
|
||||||
|
|
||||||
|
cgroup.freeze
|
||||||
|
A read-write single value file which exists on non-root cgroups.
|
||||||
|
Allowed values are "0" and "1". The default is "0".
|
||||||
|
|
||||||
|
Writing "1" to the file causes freezing of the cgroup and all
|
||||||
|
descendant cgroups. This means that all belonging processes will
|
||||||
|
be stopped and will not run until the cgroup is explicitly
|
||||||
|
unfrozen. Freezing of the cgroup may take some time; when this action
|
||||||
|
is completed, the "frozen" value in the cgroup.events control file
|
||||||
|
will be updated to "1" and the corresponding notification will be
|
||||||
|
issued.
|
||||||
|
|
||||||
|
A cgroup can be frozen either by its own settings, or by settings
|
||||||
|
of any ancestor cgroups. If any of the ancestor cgroups is frozen, the
|
||||||
|
cgroup will remain frozen.
|
||||||
|
|
||||||
|
Processes in the frozen cgroup can be killed by a fatal signal.
|
||||||
|
They also can enter and leave a frozen cgroup: either by an explicit
|
||||||
|
move by a user, or if freezing of the cgroup races with fork().
|
||||||
|
If a process is moved to a frozen cgroup, it stops. If a process is
|
||||||
|
moved out of a frozen cgroup, it becomes running.
|
||||||
|
|
||||||
|
Frozen status of a cgroup doesn't affect any cgroup tree operations:
|
||||||
|
it's possible to delete a frozen (and empty) cgroup, as well as
|
||||||
|
create new sub-cgroups.
|
||||||
|
|
||||||
Controllers
|
Controllers
|
||||||
===========
|
===========
|
||||||
|
|||||||
@@ -91,10 +91,48 @@ Currently Available
|
|||||||
* large block (up to pagesize) support
|
* large block (up to pagesize) support
|
||||||
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
|
||||||
the ordering)
|
the ordering)
|
||||||
|
* Case-insensitive file name lookups
|
||||||
|
|
||||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||||
directory hash tree having a maximum depth of two.
|
directory hash tree having a maximum depth of two.
|
||||||
|
|
||||||
|
case-insensitive file name lookups
|
||||||
|
======================================================
|
||||||
|
|
||||||
|
The case-insensitive file name lookup feature is supported on a
|
||||||
|
per-directory basis, allowing the user to mix case-insensitive and
|
||||||
|
case-sensitive directories in the same filesystem. It is enabled by
|
||||||
|
flipping the +F inode attribute of an empty directory. The
|
||||||
|
case-insensitive string match operation is only defined when we know how
|
||||||
|
text is encoded in a byte sequence. For that reason, in order to enable
|
||||||
|
case-insensitive directories, the filesystem must have the
|
||||||
|
casefold feature, which stores the filesystem-wide encoding
|
||||||
|
model used. By default, the charset adopted is the latest version of
|
||||||
|
Unicode (12.1.0, by the time of this writing), encoded in the UTF-8
|
||||||
|
form. The comparison algorithm is implemented by normalizing the
|
||||||
|
strings to the Canonical decomposition form, as defined by Unicode,
|
||||||
|
followed by a byte per byte comparison.
|
||||||
|
|
||||||
|
The case-awareness is name-preserving on the disk, meaning that the file
|
||||||
|
name provided by userspace is a byte-per-byte match to what is actually
|
||||||
|
written in the disk. The Unicode normalization format used by the
|
||||||
|
kernel is thus an internal representation, and not exposed to the
|
||||||
|
userspace nor to the disk, with the important exception of disk hashes,
|
||||||
|
used on large case-insensitive directories with DX feature. On DX
|
||||||
|
directories, the hash must be calculated using the casefolded version of
|
||||||
|
the filename, meaning that the normalization format used actually has an
|
||||||
|
impact on where the directory entry is stored.
|
||||||
|
|
||||||
|
When we change from viewing filenames as opaque byte sequences to seeing
|
||||||
|
them as encoded strings we need to address what happens when a program
|
||||||
|
tries to create a file with an invalid name. The Unicode subsystem
|
||||||
|
within the kernel leaves the decision of what to do in this case to the
|
||||||
|
filesystem, which selects its preferred behavior by enabling/disabling
|
||||||
|
the strict mode. When Ext4 encounters one of those strings and the
|
||||||
|
filesystem did not require strict mode, it falls back to considering the
|
||||||
|
entire string as an opaque byte sequence, which still allows the user to
|
||||||
|
operate on that file, but the case-insensitive lookups won't work.
|
||||||
|
|
||||||
Options
|
Options
|
||||||
=======
|
=======
|
||||||
|
|
||||||
|
|||||||
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
13
Documentation/admin-guide/hw-vuln/index.rst
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
========================
|
||||||
|
Hardware vulnerabilities
|
||||||
|
========================
|
||||||
|
|
||||||
|
This section describes CPU vulnerabilities and provides an overview of the
|
||||||
|
possible mitigations along with guidance for selecting mitigations if they
|
||||||
|
are configurable at compile, boot or run time.
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 1
|
||||||
|
|
||||||
|
l1tf
|
||||||
|
mds
|
||||||
615
Documentation/admin-guide/hw-vuln/l1tf.rst
Normal file
615
Documentation/admin-guide/hw-vuln/l1tf.rst
Normal file
@@ -0,0 +1,615 @@
|
|||||||
|
L1TF - L1 Terminal Fault
|
||||||
|
========================
|
||||||
|
|
||||||
|
L1 Terminal Fault is a hardware vulnerability which allows unprivileged
|
||||||
|
speculative access to data which is available in the Level 1 Data Cache
|
||||||
|
when the page table entry controlling the virtual address, which is used
|
||||||
|
for the access, has the Present bit cleared or other reserved bits set.
|
||||||
|
|
||||||
|
Affected processors
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
This vulnerability affects a wide range of Intel processors. The
|
||||||
|
vulnerability is not present on:
|
||||||
|
|
||||||
|
- Processors from AMD, Centaur and other non Intel vendors
|
||||||
|
|
||||||
|
- Older processor models, where the CPU family is < 6
|
||||||
|
|
||||||
|
- A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
|
||||||
|
Penwell, Pineview, Silvermont, Airmont, Merrifield)
|
||||||
|
|
||||||
|
- The Intel XEON PHI family
|
||||||
|
|
||||||
|
- Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
|
||||||
|
IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
|
||||||
|
by the Meltdown vulnerability either. These CPUs should become
|
||||||
|
available by end of 2018.
|
||||||
|
|
||||||
|
Whether a processor is affected or not can be read out from the L1TF
|
||||||
|
vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
|
||||||
|
|
||||||
|
Related CVEs
|
||||||
|
------------
|
||||||
|
|
||||||
|
The following CVE entries are related to the L1TF vulnerability:
|
||||||
|
|
||||||
|
============= ================= ==============================
|
||||||
|
CVE-2018-3615 L1 Terminal Fault SGX related aspects
|
||||||
|
CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
|
||||||
|
CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
|
||||||
|
============= ================= ==============================
|
||||||
|
|
||||||
|
Problem
|
||||||
|
-------
|
||||||
|
|
||||||
|
If an instruction accesses a virtual address for which the relevant page
|
||||||
|
table entry (PTE) has the Present bit cleared or other reserved bits set,
|
||||||
|
then speculative execution ignores the invalid PTE and loads the referenced
|
||||||
|
data if it is present in the Level 1 Data Cache, as if the page referenced
|
||||||
|
by the address bits in the PTE was still present and accessible.
|
||||||
|
|
||||||
|
While this is a purely speculative mechanism and the instruction will raise
|
||||||
|
a page fault when it is retired eventually, the pure act of loading the
|
||||||
|
data and making it available to other speculative instructions opens up the
|
||||||
|
opportunity for side channel attacks to unprivileged malicious code,
|
||||||
|
similar to the Meltdown attack.
|
||||||
|
|
||||||
|
While Meltdown breaks the user space to kernel space protection, L1TF
|
||||||
|
allows to attack any physical memory address in the system and the attack
|
||||||
|
works across all protection domains. It allows an attack of SGX and also
|
||||||
|
works from inside virtual machines because the speculation bypasses the
|
||||||
|
extended page table (EPT) protection mechanism.
|
||||||
|
|
||||||
|
|
||||||
|
Attack scenarios
|
||||||
|
----------------
|
||||||
|
|
||||||
|
1. Malicious user space
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Operating Systems store arbitrary information in the address bits of a
|
||||||
|
PTE which is marked non present. This allows a malicious user space
|
||||||
|
application to attack the physical memory to which these PTEs resolve.
|
||||||
|
In some cases user-space can maliciously influence the information
|
||||||
|
encoded in the address bits of the PTE, thus making attacks more
|
||||||
|
deterministic and more practical.
|
||||||
|
|
||||||
|
The Linux kernel contains a mitigation for this attack vector, PTE
|
||||||
|
inversion, which is permanently enabled and has no performance
|
||||||
|
impact. The kernel ensures that the address bits of PTEs, which are not
|
||||||
|
marked present, never point to cacheable physical memory space.
|
||||||
|
|
||||||
|
A system with an up to date kernel is protected against attacks from
|
||||||
|
malicious user space applications.
|
||||||
|
|
||||||
|
2. Malicious guest in a virtual machine
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The fact that L1TF breaks all domain protections allows malicious guest
|
||||||
|
OSes, which can control the PTEs directly, and malicious guest user
|
||||||
|
space applications, which run on an unprotected guest kernel lacking the
|
||||||
|
PTE inversion mitigation for L1TF, to attack physical host memory.
|
||||||
|
|
||||||
|
A special aspect of L1TF in the context of virtualization is symmetric
|
||||||
|
multi threading (SMT). The Intel implementation of SMT is called
|
||||||
|
HyperThreading. The fact that Hyperthreads on the affected processors
|
||||||
|
share the L1 Data Cache (L1D) is important for this. As the flaw allows
|
||||||
|
only to attack data which is present in L1D, a malicious guest running
|
||||||
|
on one Hyperthread can attack the data which is brought into the L1D by
|
||||||
|
the context which runs on the sibling Hyperthread of the same physical
|
||||||
|
core. This context can be host OS, host user space or a different guest.
|
||||||
|
|
||||||
|
If the processor does not support Extended Page Tables, the attack is
|
||||||
|
only possible, when the hypervisor does not sanitize the content of the
|
||||||
|
effective (shadow) page tables.
|
||||||
|
|
||||||
|
While solutions exist to mitigate these attack vectors fully, these
|
||||||
|
mitigations are not enabled by default in the Linux kernel because they
|
||||||
|
can affect performance significantly. The kernel provides several
|
||||||
|
mechanisms which can be utilized to address the problem depending on the
|
||||||
|
deployment scenario. The mitigations, their protection scope and impact
|
||||||
|
are described in the next sections.
|
||||||
|
|
||||||
|
The default mitigations and the rationale for choosing them are explained
|
||||||
|
at the end of this document. See :ref:`default_mitigations`.
|
||||||
|
|
||||||
|
.. _l1tf_sys_info:
|
||||||
|
|
||||||
|
L1TF system information
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The Linux kernel provides a sysfs interface to enumerate the current L1TF
|
||||||
|
status of the system: whether the system is vulnerable, and which
|
||||||
|
mitigations are active. The relevant sysfs file is:
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/l1tf
|
||||||
|
|
||||||
|
The possible values in this file are:
|
||||||
|
|
||||||
|
=========================== ===============================
|
||||||
|
'Not affected' The processor is not vulnerable
|
||||||
|
'Mitigation: PTE Inversion' The host protection is active
|
||||||
|
=========================== ===============================
|
||||||
|
|
||||||
|
If KVM/VMX is enabled and the processor is vulnerable then the following
|
||||||
|
information is appended to the 'Mitigation: PTE Inversion' part:
|
||||||
|
|
||||||
|
- SMT status:
|
||||||
|
|
||||||
|
===================== ================
|
||||||
|
'VMX: SMT vulnerable' SMT is enabled
|
||||||
|
'VMX: SMT disabled' SMT is disabled
|
||||||
|
===================== ================
|
||||||
|
|
||||||
|
- L1D Flush mode:
|
||||||
|
|
||||||
|
================================ ====================================
|
||||||
|
'L1D vulnerable' L1D flushing is disabled
|
||||||
|
|
||||||
|
'L1D conditional cache flushes' L1D flush is conditionally enabled
|
||||||
|
|
||||||
|
'L1D cache flushes' L1D flush is unconditionally enabled
|
||||||
|
================================ ====================================
|
||||||
|
|
||||||
|
The resulting grade of protection is discussed in the following sections.
|
||||||
|
|
||||||
|
|
||||||
|
Host mitigation mechanism
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
The kernel is unconditionally protected against L1TF attacks from malicious
|
||||||
|
user space running on the host.
|
||||||
|
|
||||||
|
|
||||||
|
Guest mitigation mechanisms
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
.. _l1d_flush:
|
||||||
|
|
||||||
|
1. L1D flush on VMENTER
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
To make sure that a guest cannot attack data which is present in the L1D
|
||||||
|
the hypervisor flushes the L1D before entering the guest.
|
||||||
|
|
||||||
|
Flushing the L1D evicts not only the data which should not be accessed
|
||||||
|
by a potentially malicious guest, it also flushes the guest
|
||||||
|
data. Flushing the L1D has a performance impact as the processor has to
|
||||||
|
bring the flushed guest data back into the L1D. Depending on the
|
||||||
|
frequency of VMEXIT/VMENTER and the type of computations in the guest
|
||||||
|
performance degradation in the range of 1% to 50% has been observed. For
|
||||||
|
scenarios where guest VMEXIT/VMENTER are rare the performance impact is
|
||||||
|
minimal. Virtio and mechanisms like posted interrupts are designed to
|
||||||
|
confine the VMEXITs to a bare minimum, but specific configurations and
|
||||||
|
application scenarios might still suffer from a high VMEXIT rate.
|
||||||
|
|
||||||
|
The kernel provides two L1D flush modes:
|
||||||
|
- conditional ('cond')
|
||||||
|
- unconditional ('always')
|
||||||
|
|
||||||
|
The conditional mode avoids L1D flushing after VMEXITs which execute
|
||||||
|
only audited code paths before the corresponding VMENTER. These code
|
||||||
|
paths have been verified that they cannot expose secrets or other
|
||||||
|
interesting data to an attacker, but they can leak information about the
|
||||||
|
address space layout of the hypervisor.
|
||||||
|
|
||||||
|
Unconditional mode flushes L1D on all VMENTER invocations and provides
|
||||||
|
maximum protection. It has a higher overhead than the conditional
|
||||||
|
mode. The overhead cannot be quantified correctly as it depends on the
|
||||||
|
workload scenario and the resulting number of VMEXITs.
|
||||||
|
|
||||||
|
The general recommendation is to enable L1D flush on VMENTER. The kernel
|
||||||
|
defaults to conditional mode on affected processors.
|
||||||
|
|
||||||
|
**Note**, that L1D flush does not prevent the SMT problem because the
|
||||||
|
sibling thread will also bring back its data into the L1D which makes it
|
||||||
|
attackable again.
|
||||||
|
|
||||||
|
L1D flush can be controlled by the administrator via the kernel command
|
||||||
|
line and sysfs control files. See :ref:`mitigation_control_command_line`
|
||||||
|
and :ref:`mitigation_control_kvm`.
|
||||||
|
|
||||||
|
.. _guest_confinement:
|
||||||
|
|
||||||
|
2. Guest VCPU confinement to dedicated physical cores
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
To address the SMT problem, it is possible to make a guest or a group of
|
||||||
|
guests affine to one or more physical cores. The proper mechanism for
|
||||||
|
that is to utilize exclusive cpusets to ensure that no other guest or
|
||||||
|
host tasks can run on these cores.
|
||||||
|
|
||||||
|
If only a single guest or related guests run on sibling SMT threads on
|
||||||
|
the same physical core then they can only attack their own memory and
|
||||||
|
restricted parts of the host memory.
|
||||||
|
|
||||||
|
Host memory is attackable, when one of the sibling SMT threads runs in
|
||||||
|
host OS (hypervisor) context and the other in guest context. The amount
|
||||||
|
of valuable information from the host OS context depends on the context
|
||||||
|
which the host OS executes, i.e. interrupts, soft interrupts and kernel
|
||||||
|
threads. The amount of valuable data from these contexts cannot be
|
||||||
|
declared as non-interesting for an attacker without deep inspection of
|
||||||
|
the code.
|
||||||
|
|
||||||
|
**Note**, that assigning guests to a fixed set of physical cores affects
|
||||||
|
the ability of the scheduler to do load balancing and might have
|
||||||
|
negative effects on CPU utilization depending on the hosting
|
||||||
|
scenario. Disabling SMT might be a viable alternative for particular
|
||||||
|
scenarios.
|
||||||
|
|
||||||
|
For further information about confining guests to a single or to a group
|
||||||
|
of cores consult the cpusets documentation:
|
||||||
|
|
||||||
|
https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
|
||||||
|
|
||||||
|
.. _interrupt_isolation:
|
||||||
|
|
||||||
|
3. Interrupt affinity
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Interrupts can be made affine to logical CPUs. This is not universally
|
||||||
|
true because there are types of interrupts which are truly per CPU
|
||||||
|
interrupts, e.g. the local timer interrupt. Aside from that multi queue
|
||||||
|
devices affine their interrupts to single CPUs or groups of CPUs per
|
||||||
|
queue without allowing the administrator to control the affinities.
|
||||||
|
|
||||||
|
Moving the interrupts, which can be affinity controlled, away from CPUs
|
||||||
|
which run untrusted guests, reduces the attack vector space.
|
||||||
|
|
||||||
|
Whether the interrupts which are affine to CPUs, which run untrusted
|
||||||
|
guests, provide interesting data for an attacker depends on the system
|
||||||
|
configuration and the scenarios which run on the system. While for some
|
||||||
|
of the interrupts it can be assumed that they won't expose interesting
|
||||||
|
information beyond exposing hints about the host OS memory layout, there
|
||||||
|
is no way to make general assumptions.
|
||||||
|
|
||||||
|
Interrupt affinity can be controlled by the administrator via the
|
||||||
|
/proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
|
||||||
|
available at:
|
||||||
|
|
||||||
|
https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
|
||||||
|
|
||||||
|
.. _smt_control:
|
||||||
|
|
||||||
|
4. SMT control
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
To prevent the SMT issues of L1TF it might be necessary to disable SMT
|
||||||
|
completely. Disabling SMT can have a significant performance impact, but
|
||||||
|
the impact depends on the hosting scenario and the type of workloads.
|
||||||
|
The impact of disabling SMT needs also to be weighed against the impact
|
||||||
|
of other mitigation solutions like confining guests to dedicated cores.
|
||||||
|
|
||||||
|
The kernel provides a sysfs interface to retrieve the status of SMT and
|
||||||
|
to control it. It also provides a kernel command line interface to
|
||||||
|
control SMT.
|
||||||
|
|
||||||
|
The kernel command line interface consists of the following options:
|
||||||
|
|
||||||
|
=========== ==========================================================
|
||||||
|
nosmt Affects the bring up of the secondary CPUs during boot. The
|
||||||
|
kernel tries to bring all present CPUs online during the
|
||||||
|
boot process. "nosmt" makes sure that from each physical
|
||||||
|
core only one - the so called primary (hyper) thread is
|
||||||
|
activated. Due to a design flaw of Intel processors related
|
||||||
|
to Machine Check Exceptions the non primary siblings have
|
||||||
|
to be brought up at least partially and are then shut down
|
||||||
|
again. "nosmt" can be undone via the sysfs interface.
|
||||||
|
|
||||||
|
nosmt=force Has the same effect as "nosmt" but it does not allow to
|
||||||
|
undo the SMT disable via the sysfs interface.
|
||||||
|
=========== ==========================================================
|
||||||
|
|
||||||
|
The sysfs interface provides two files:
|
||||||
|
|
||||||
|
- /sys/devices/system/cpu/smt/control
|
||||||
|
- /sys/devices/system/cpu/smt/active
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/smt/control:
|
||||||
|
|
||||||
|
This file allows to read out the SMT control state and provides the
|
||||||
|
ability to disable or (re)enable SMT. The possible states are:
|
||||||
|
|
||||||
|
============== ===================================================
|
||||||
|
on SMT is supported by the CPU and enabled. All
|
||||||
|
logical CPUs can be onlined and offlined without
|
||||||
|
restrictions.
|
||||||
|
|
||||||
|
off SMT is supported by the CPU and disabled. Only
|
||||||
|
the so called primary SMT threads can be onlined
|
||||||
|
and offlined without restrictions. An attempt to
|
||||||
|
online a non-primary sibling is rejected
|
||||||
|
|
||||||
|
forceoff Same as 'off' but the state cannot be controlled.
|
||||||
|
Attempts to write to the control file are rejected.
|
||||||
|
|
||||||
|
notsupported The processor does not support SMT. It's therefore
|
||||||
|
not affected by the SMT implications of L1TF.
|
||||||
|
Attempts to write to the control file are rejected.
|
||||||
|
============== ===================================================
|
||||||
|
|
||||||
|
The possible states which can be written into this file to control SMT
|
||||||
|
state are:
|
||||||
|
|
||||||
|
- on
|
||||||
|
- off
|
||||||
|
- forceoff
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/smt/active:
|
||||||
|
|
||||||
|
This file reports whether SMT is enabled and active, i.e. if on any
|
||||||
|
physical core two or more sibling threads are online.
|
||||||
|
|
||||||
|
SMT control is also possible at boot time via the l1tf kernel command
|
||||||
|
line parameter in combination with L1D flush control. See
|
||||||
|
:ref:`mitigation_control_command_line`.
|
||||||
|
|
||||||
|
5. Disabling EPT
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Disabling EPT for virtual machines provides full mitigation for L1TF even
|
||||||
|
with SMT enabled, because the effective page tables for guests are
|
||||||
|
managed and sanitized by the hypervisor. Though disabling EPT has a
|
||||||
|
significant performance impact especially when the Meltdown mitigation
|
||||||
|
KPTI is enabled.
|
||||||
|
|
||||||
|
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
||||||
|
|
||||||
|
There is ongoing research and development for new mitigation mechanisms to
|
||||||
|
address the performance impact of disabling SMT or EPT.
|
||||||
|
|
||||||
|
.. _mitigation_control_command_line:
|
||||||
|
|
||||||
|
Mitigation control on the kernel command line
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
The kernel command line allows to control the L1TF mitigations at boot
|
||||||
|
time with the option "l1tf=". The valid arguments for this option are:
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
full Provides all available mitigations for the L1TF
|
||||||
|
vulnerability. Disables SMT and enables all mitigations in
|
||||||
|
the hypervisors, i.e. unconditional L1D flushing
|
||||||
|
|
||||||
|
SMT control and L1D flush control via the sysfs interface
|
||||||
|
is still possible after boot. Hypervisors will issue a
|
||||||
|
warning when the first VM is started in a potentially
|
||||||
|
insecure configuration, i.e. SMT enabled or L1D flush
|
||||||
|
disabled.
|
||||||
|
|
||||||
|
full,force Same as 'full', but disables SMT and L1D flush runtime
|
||||||
|
control. Implies the 'nosmt=force' command line option.
|
||||||
|
(i.e. sysfs control of SMT is disabled.)
|
||||||
|
|
||||||
|
flush Leaves SMT enabled and enables the default hypervisor
|
||||||
|
mitigation, i.e. conditional L1D flushing
|
||||||
|
|
||||||
|
SMT control and L1D flush control via the sysfs interface
|
||||||
|
is still possible after boot. Hypervisors will issue a
|
||||||
|
warning when the first VM is started in a potentially
|
||||||
|
insecure configuration, i.e. SMT enabled or L1D flush
|
||||||
|
disabled.
|
||||||
|
|
||||||
|
flush,nosmt Disables SMT and enables the default hypervisor mitigation,
|
||||||
|
i.e. conditional L1D flushing.
|
||||||
|
|
||||||
|
SMT control and L1D flush control via the sysfs interface
|
||||||
|
is still possible after boot. Hypervisors will issue a
|
||||||
|
warning when the first VM is started in a potentially
|
||||||
|
insecure configuration, i.e. SMT enabled or L1D flush
|
||||||
|
disabled.
|
||||||
|
|
||||||
|
flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
|
||||||
|
started in a potentially insecure configuration.
|
||||||
|
|
||||||
|
off Disables hypervisor mitigations and doesn't emit any
|
||||||
|
warnings.
|
||||||
|
It also drops the swap size and available RAM limit restrictions
|
||||||
|
on both hypervisor and bare metal.
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
|
||||||
|
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
||||||
|
|
||||||
|
|
||||||
|
.. _mitigation_control_kvm:
|
||||||
|
|
||||||
|
Mitigation control for KVM - module parameter
|
||||||
|
-------------------------------------------------------------
|
||||||
|
|
||||||
|
The KVM hypervisor mitigation mechanism, flushing the L1D cache when
|
||||||
|
entering a guest, can be controlled with a module parameter.
|
||||||
|
|
||||||
|
The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
|
||||||
|
following arguments:
|
||||||
|
|
||||||
|
============ ==============================================================
|
||||||
|
always L1D cache flush on every VMENTER.
|
||||||
|
|
||||||
|
cond Flush L1D on VMENTER only when the code between VMEXIT and
|
||||||
|
VMENTER can leak host memory which is considered
|
||||||
|
interesting for an attacker. This still can leak host memory
|
||||||
|
which allows e.g. to determine the host's address space layout.
|
||||||
|
|
||||||
|
never Disables the mitigation
|
||||||
|
============ ==============================================================
|
||||||
|
|
||||||
|
The parameter can be provided on the kernel command line, as a module
|
||||||
|
parameter when loading the modules and at runtime modified via the sysfs
|
||||||
|
file:
|
||||||
|
|
||||||
|
/sys/module/kvm_intel/parameters/vmentry_l1d_flush
|
||||||
|
|
||||||
|
The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
||||||
|
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
||||||
|
module parameter is ignored and writes to the sysfs file are rejected.
|
||||||
|
|
||||||
|
.. _mitigation_selection:
|
||||||
|
|
||||||
|
Mitigation selection guide
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
1. No virtualization in use
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The system is protected by the kernel unconditionally and no further
|
||||||
|
action is required.
|
||||||
|
|
||||||
|
2. Virtualization with trusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If the guest comes from a trusted source and the guest OS kernel is
|
||||||
|
guaranteed to have the L1TF mitigations in place the system is fully
|
||||||
|
protected against L1TF and no further action is required.
|
||||||
|
|
||||||
|
To avoid the overhead of the default L1D flushing on VMENTER the
|
||||||
|
administrator can disable the flushing via the kernel command line and
|
||||||
|
sysfs control files. See :ref:`mitigation_control_command_line` and
|
||||||
|
:ref:`mitigation_control_kvm`.
|
||||||
|
|
||||||
|
|
||||||
|
3. Virtualization with untrusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
3.1. SMT not supported or disabled
|
||||||
|
""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
If SMT is not supported by the processor or disabled in the BIOS or by
|
||||||
|
the kernel, it's only required to enforce L1D flushing on VMENTER.
|
||||||
|
|
||||||
|
Conditional L1D flushing is the default behaviour and can be tuned. See
|
||||||
|
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
||||||
|
|
||||||
|
3.2. EPT not supported or disabled
|
||||||
|
""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
If EPT is not supported by the processor or disabled in the hypervisor,
|
||||||
|
the system is fully protected. SMT can stay enabled and L1D flushing on
|
||||||
|
VMENTER is not required.
|
||||||
|
|
||||||
|
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
||||||
|
|
||||||
|
3.3. SMT and EPT supported and active
|
||||||
|
"""""""""""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
If SMT and EPT are supported and active then various degrees of
|
||||||
|
mitigations can be employed:
|
||||||
|
|
||||||
|
- L1D flushing on VMENTER:
|
||||||
|
|
||||||
|
L1D flushing on VMENTER is the minimal protection requirement, but it
|
||||||
|
is only potent in combination with other mitigation methods.
|
||||||
|
|
||||||
|
Conditional L1D flushing is the default behaviour and can be tuned. See
|
||||||
|
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
||||||
|
|
||||||
|
- Guest confinement:
|
||||||
|
|
||||||
|
Confinement of guests to a single or a group of physical cores which
|
||||||
|
are not running any other processes, can reduce the attack surface
|
||||||
|
significantly, but interrupts, soft interrupts and kernel threads can
|
||||||
|
still expose valuable data to a potential attacker. See
|
||||||
|
:ref:`guest_confinement`.
|
||||||
|
|
||||||
|
- Interrupt isolation:
|
||||||
|
|
||||||
|
Isolating the guest CPUs from interrupts can reduce the attack surface
|
||||||
|
further, but still allows a malicious guest to explore a limited amount
|
||||||
|
of host physical memory. This can at least be used to gain knowledge
|
||||||
|
about the host address space layout. The interrupts which have a fixed
|
||||||
|
affinity to the CPUs which run the untrusted guests can depending on
|
||||||
|
the scenario still trigger soft interrupts and schedule kernel threads
|
||||||
|
which might expose valuable information. See
|
||||||
|
:ref:`interrupt_isolation`.
|
||||||
|
|
||||||
|
The above three mitigation methods combined can provide protection to a
|
||||||
|
certain degree, but the risk of the remaining attack surface has to be
|
||||||
|
carefully analyzed. For full protection the following methods are
|
||||||
|
available:
|
||||||
|
|
||||||
|
- Disabling SMT:
|
||||||
|
|
||||||
|
Disabling SMT and enforcing the L1D flushing provides the maximum
|
||||||
|
amount of protection. This mitigation is not depending on any of the
|
||||||
|
above mitigation methods.
|
||||||
|
|
||||||
|
SMT control and L1D flushing can be tuned by the command line
|
||||||
|
parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
|
||||||
|
time with the matching sysfs control files. See :ref:`smt_control`,
|
||||||
|
:ref:`mitigation_control_command_line` and
|
||||||
|
:ref:`mitigation_control_kvm`.
|
||||||
|
|
||||||
|
- Disabling EPT:
|
||||||
|
|
||||||
|
Disabling EPT provides the maximum amount of protection as well. It is
|
||||||
|
not depending on any of the above mitigation methods. SMT can stay
|
||||||
|
enabled and L1D flushing is not required, but the performance impact is
|
||||||
|
significant.
|
||||||
|
|
||||||
|
EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
|
||||||
|
parameter.
|
||||||
|
|
||||||
|
3.4. Nested virtual machines
|
||||||
|
""""""""""""""""""""""""""""
|
||||||
|
|
||||||
|
When nested virtualization is in use, three operating systems are involved:
|
||||||
|
the bare metal hypervisor, the nested hypervisor and the nested virtual
|
||||||
|
machine. VMENTER operations from the nested hypervisor into the nested
|
||||||
|
guest will always be processed by the bare metal hypervisor. If KVM is the
|
||||||
|
bare metal hypervisor it will:
|
||||||
|
|
||||||
|
- Flush the L1D cache on every switch from the nested hypervisor to the
|
||||||
|
nested virtual machine, so that the nested hypervisor's secrets are not
|
||||||
|
exposed to the nested virtual machine;
|
||||||
|
|
||||||
|
- Flush the L1D cache on every switch from the nested virtual machine to
|
||||||
|
the nested hypervisor; this is a complex operation, and flushing the L1D
|
||||||
|
cache avoids that the bare metal hypervisor's secrets are exposed to the
|
||||||
|
nested virtual machine;
|
||||||
|
|
||||||
|
- Instruct the nested hypervisor to not perform any L1D cache flush. This
|
||||||
|
is an optimization to avoid double L1D flushing.
|
||||||
|
|
||||||
|
|
||||||
|
.. _default_mitigations:
|
||||||
|
|
||||||
|
Default mitigations
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The kernel default mitigations for vulnerable processors are:
|
||||||
|
|
||||||
|
- PTE inversion to protect against malicious user space. This is done
|
||||||
|
unconditionally and cannot be controlled. The swap storage is limited
|
||||||
|
to ~16TB.
|
||||||
|
|
||||||
|
- L1D conditional flushing on VMENTER when EPT is enabled for
|
||||||
|
a guest.
|
||||||
|
|
||||||
|
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||||
|
SMT systems vulnerable when running untrusted guests with EPT enabled.
|
||||||
|
|
||||||
|
The rationale for this choice is:
|
||||||
|
|
||||||
|
- Force disabling SMT can break existing setups, especially with
|
||||||
|
unattended updates.
|
||||||
|
|
||||||
|
- If regular users run untrusted guests on their machine, then L1TF is
|
||||||
|
just an add on to other malware which might be embedded in an untrusted
|
||||||
|
guest, e.g. spam-bots or attacks on the local network.
|
||||||
|
|
||||||
|
There is no technical way to prevent a user from running untrusted code
|
||||||
|
on their machines blindly.
|
||||||
|
|
||||||
|
- It's technically extremely unlikely and from today's knowledge even
|
||||||
|
impossible that L1TF can be exploited via the most popular attack
|
||||||
|
mechanisms like JavaScript because these mechanisms have no way to
|
||||||
|
control PTEs. If this would be possible and no other mitigation would
|
||||||
|
be possible, then the default might be different.
|
||||||
|
|
||||||
|
- The administrators of cloud and hosting setups have to carefully
|
||||||
|
analyze the risk for their scenarios and make the appropriate
|
||||||
|
mitigation choices, which might even vary across their deployed
|
||||||
|
machines and also result in other changes of their overall setup.
|
||||||
|
There is no way for the kernel to provide a sensible default for this
|
||||||
|
kind of scenarios.
|
||||||
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
308
Documentation/admin-guide/hw-vuln/mds.rst
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
MDS - Microarchitectural Data Sampling
|
||||||
|
======================================
|
||||||
|
|
||||||
|
Microarchitectural Data Sampling is a hardware vulnerability which allows
|
||||||
|
unprivileged speculative access to data which is available in various CPU
|
||||||
|
internal buffers.
|
||||||
|
|
||||||
|
Affected processors
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
This vulnerability affects a wide range of Intel processors. The
|
||||||
|
vulnerability is not present on:
|
||||||
|
|
||||||
|
- Processors from AMD, Centaur and other non Intel vendors
|
||||||
|
|
||||||
|
- Older processor models, where the CPU family is < 6
|
||||||
|
|
||||||
|
- Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
|
||||||
|
|
||||||
|
- Intel processors which have the ARCH_CAP_MDS_NO bit set in the
|
||||||
|
IA32_ARCH_CAPABILITIES MSR.
|
||||||
|
|
||||||
|
Whether a processor is affected or not can be read out from the MDS
|
||||||
|
vulnerability file in sysfs. See :ref:`mds_sys_info`.
|
||||||
|
|
||||||
|
Not all processors are affected by all variants of MDS, but the mitigation
|
||||||
|
is identical for all of them so the kernel treats them as a single
|
||||||
|
vulnerability.
|
||||||
|
|
||||||
|
Related CVEs
|
||||||
|
------------
|
||||||
|
|
||||||
|
The following CVE entries are related to the MDS vulnerability:
|
||||||
|
|
||||||
|
============== ===== ===================================================
|
||||||
|
CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
|
||||||
|
CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
|
||||||
|
CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
|
||||||
|
CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
|
||||||
|
============== ===== ===================================================
|
||||||
|
|
||||||
|
Problem
|
||||||
|
-------
|
||||||
|
|
||||||
|
When performing store, load, L1 refill operations, processors write data
|
||||||
|
into temporary microarchitectural structures (buffers). The data in the
|
||||||
|
buffer can be forwarded to load operations as an optimization.
|
||||||
|
|
||||||
|
Under certain conditions, usually a fault/assist caused by a load
|
||||||
|
operation, data unrelated to the load memory address can be speculatively
|
||||||
|
forwarded from the buffers. Because the load operation causes a fault or
|
||||||
|
assist and its result will be discarded, the forwarded data will not cause
|
||||||
|
incorrect program execution or state changes. But a malicious operation
|
||||||
|
may be able to forward this speculative data to a disclosure gadget which
|
||||||
|
allows in turn to infer the value via a cache side channel attack.
|
||||||
|
|
||||||
|
Because the buffers are potentially shared between Hyper-Threads, cross
|
||||||
|
Hyper-Thread attacks are possible.
|
||||||
|
|
||||||
|
Deeper technical information is available in the MDS specific x86
|
||||||
|
architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
||||||
|
|
||||||
|
|
||||||
|
Attack scenarios
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Attacks against the MDS vulnerabilities can be mounted from malicious non
|
||||||
|
privileged user space applications running on hosts or guests. Malicious
|
||||||
|
guest OSes can obviously mount attacks as well.
|
||||||
|
|
||||||
|
Contrary to other speculation based vulnerabilities the MDS vulnerability
|
||||||
|
does not allow the attacker to control the memory target address. As a
|
||||||
|
consequence the attacks are purely sampling based, but as demonstrated with
|
||||||
|
the TLBleed attack samples can be postprocessed successfully.
|
||||||
|
|
||||||
|
Web-Browsers
|
||||||
|
^^^^^^^^^^^^
|
||||||
|
|
||||||
|
It's unclear whether attacks through Web-Browsers are possible at
|
||||||
|
all. The exploitation through JavaScript is considered very unlikely,
|
||||||
|
but other widely used web technologies like WebAssembly could possibly be
|
||||||
|
abused.
|
||||||
|
|
||||||
|
|
||||||
|
.. _mds_sys_info:
|
||||||
|
|
||||||
|
MDS system information
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The Linux kernel provides a sysfs interface to enumerate the current MDS
|
||||||
|
status of the system: whether the system is vulnerable, and which
|
||||||
|
mitigations are active. The relevant sysfs file is:
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/mds
|
||||||
|
|
||||||
|
The possible values in this file are:
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
|
||||||
|
* - 'Not affected'
|
||||||
|
- The processor is not vulnerable
|
||||||
|
* - 'Vulnerable'
|
||||||
|
- The processor is vulnerable, but no mitigation is enabled
|
||||||
|
* - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
||||||
|
- The processor is vulnerable but microcode is not updated.
|
||||||
|
|
||||||
|
The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
|
||||||
|
* - 'Mitigation: Clear CPU buffers'
|
||||||
|
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||||
|
enabled.
|
||||||
|
|
||||||
|
If the processor is vulnerable then the following information is appended
|
||||||
|
to the above information:
|
||||||
|
|
||||||
|
======================== ============================================
|
||||||
|
'SMT vulnerable' SMT is enabled
|
||||||
|
'SMT mitigated' SMT is enabled and mitigated
|
||||||
|
'SMT disabled' SMT is disabled
|
||||||
|
'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
||||||
|
======================== ============================================
|
||||||
|
|
||||||
|
.. _vmwerv:
|
||||||
|
|
||||||
|
Best effort mitigation mode
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If the processor is vulnerable, but the availability of the microcode based
|
||||||
|
mitigation mechanism is not advertised via CPUID the kernel selects a best
|
||||||
|
effort mitigation mode. This mode invokes the mitigation instructions
|
||||||
|
without a guarantee that they clear the CPU buffers.
|
||||||
|
|
||||||
|
This is done to address virtualization scenarios where the host has the
|
||||||
|
microcode update applied, but the hypervisor is not yet updated to expose
|
||||||
|
the CPUID to the guest. If the host has updated microcode the protection
|
||||||
|
takes effect; otherwise a few CPU cycles are wasted pointlessly.
|
||||||
|
|
||||||
|
The state in the mds sysfs file reflects this situation accordingly.
|
||||||
|
|
||||||
|
|
||||||
|
Mitigation mechanism
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
The kernel detects the affected CPUs and the presence of the microcode
|
||||||
|
which is required.
|
||||||
|
|
||||||
|
If a CPU is affected and the microcode is available, then the kernel
|
||||||
|
enables the mitigation by default. The mitigation can be controlled at boot
|
||||||
|
time via a kernel command line option. See
|
||||||
|
:ref:`mds_mitigation_control_command_line`.
|
||||||
|
|
||||||
|
.. _cpu_buffer_clear:
|
||||||
|
|
||||||
|
CPU buffer clearing
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The mitigation for MDS clears the affected CPU buffers on return to user
|
||||||
|
space and when entering a guest.
|
||||||
|
|
||||||
|
If SMT is enabled it also clears the buffers on idle entry when the CPU
|
||||||
|
is only affected by MSBDS and not any other MDS variant, because the
|
||||||
|
other variants cannot be protected against cross Hyper-Thread attacks.
|
||||||
|
|
||||||
|
For CPUs which are only affected by MSBDS the user space, guest and idle
|
||||||
|
transition mitigations are sufficient and SMT is not affected.
|
||||||
|
|
||||||
|
.. _virt_mechanism:
|
||||||
|
|
||||||
|
Virtualization mitigation
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The protection for host to guest transition depends on the L1TF
|
||||||
|
vulnerability of the CPU:
|
||||||
|
|
||||||
|
- CPU is affected by L1TF:
|
||||||
|
|
||||||
|
If the L1D flush mitigation is enabled and up to date microcode is
|
||||||
|
available, the L1D flush mitigation is automatically protecting the
|
||||||
|
guest transition.
|
||||||
|
|
||||||
|
If the L1D flush mitigation is disabled then the MDS mitigation is
|
||||||
|
invoked explicitly when the host MDS mitigation is enabled.
|
||||||
|
|
||||||
|
For details on L1TF and virtualization see:
|
||||||
|
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
|
||||||
|
|
||||||
|
- CPU is not affected by L1TF:
|
||||||
|
|
||||||
|
CPU buffers are flushed before entering the guest when the host MDS
|
||||||
|
mitigation is enabled.
|
||||||
|
|
||||||
|
The resulting MDS protection matrix for the host to guest transition:
|
||||||
|
|
||||||
|
============ ===== ============= ============ =================
|
||||||
|
L1TF MDS VMX-L1FLUSH Host MDS MDS-State
|
||||||
|
|
||||||
|
Don't care No Don't care N/A Not affected
|
||||||
|
|
||||||
|
Yes Yes Disabled Off Vulnerable
|
||||||
|
|
||||||
|
Yes Yes Disabled Full Mitigated
|
||||||
|
|
||||||
|
Yes Yes Enabled Don't care Mitigated
|
||||||
|
|
||||||
|
No Yes N/A Off Vulnerable
|
||||||
|
|
||||||
|
No Yes N/A Full Mitigated
|
||||||
|
============ ===== ============= ============ =================
|
||||||
|
|
||||||
|
This only covers the host to guest transition, i.e. prevents leakage from
|
||||||
|
host to guest, but does not protect the guest internally. Guests need to
|
||||||
|
have their own protections.
|
||||||
|
|
||||||
|
.. _xeon_phi:
|
||||||
|
|
||||||
|
XEON PHI specific considerations
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The XEON PHI processor family is affected by MSBDS which can be exploited
|
||||||
|
cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
|
||||||
|
to use MWAIT in user space (Ring 3) which opens a potential attack vector
|
||||||
|
for malicious user space. The exposure can be disabled on the kernel
|
||||||
|
command line with the 'ring3mwait=disable' command line option.
|
||||||
|
|
||||||
|
XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
|
||||||
|
before the CPU enters an idle state. As XEON PHI is not affected by L1TF
|
||||||
|
either disabling SMT is not required for full protection.
|
||||||
|
|
||||||
|
.. _mds_smt_control:
|
||||||
|
|
||||||
|
SMT control
|
||||||
|
^^^^^^^^^^^
|
||||||
|
|
||||||
|
All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
|
||||||
|
means on CPUs which are affected by MFBDS or MLPDS it is necessary to
|
||||||
|
disable SMT for full protection. These are most of the affected CPUs; the
|
||||||
|
exception is XEON PHI, see :ref:`xeon_phi`.
|
||||||
|
|
||||||
|
Disabling SMT can have a significant performance impact, but the impact
|
||||||
|
depends on the type of workloads.
|
||||||
|
|
||||||
|
See the relevant chapter in the L1TF mitigation documentation for details:
|
||||||
|
:ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <smt_control>`.
|
||||||
|
|
||||||
|
|
||||||
|
.. _mds_mitigation_control_command_line:
|
||||||
|
|
||||||
|
Mitigation control on the kernel command line
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
The kernel command line allows to control the MDS mitigations at boot
|
||||||
|
time with the option "mds=". The valid arguments for this option are:
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
full If the CPU is vulnerable, enable all available mitigations
|
||||||
|
for the MDS vulnerability, CPU buffer clearing on exit to
|
||||||
|
userspace and when entering a VM. Idle transitions are
|
||||||
|
protected as well if SMT is enabled.
|
||||||
|
|
||||||
|
It does not automatically disable SMT.
|
||||||
|
|
||||||
|
full,nosmt The same as mds=full, with SMT disabled on vulnerable
|
||||||
|
CPUs. This is the complete mitigation.
|
||||||
|
|
||||||
|
off Disables MDS mitigations completely.
|
||||||
|
|
||||||
|
============ =============================================================
|
||||||
|
|
||||||
|
Not specifying this option is equivalent to "mds=full".
|
||||||
|
|
||||||
|
|
||||||
|
Mitigation selection guide
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
1. Trusted userspace
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
If all userspace applications are from a trusted source and do not
|
||||||
|
execute untrusted code which is supplied externally, then the mitigation
|
||||||
|
can be disabled.
|
||||||
|
|
||||||
|
|
||||||
|
2. Virtualization with trusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The same considerations as above versus trusted user space apply.
|
||||||
|
|
||||||
|
3. Virtualization with untrusted guests
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The protection depends on the state of the L1TF mitigations.
|
||||||
|
See :ref:`virt_mechanism`.
|
||||||
|
|
||||||
|
If the MDS mitigation is enabled and SMT is disabled, guest to host and
|
||||||
|
guest to guest attacks are prevented.
|
||||||
|
|
||||||
|
.. _mds_default_mitigations:
|
||||||
|
|
||||||
|
Default mitigations
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
The kernel default mitigations for vulnerable processors are:
|
||||||
|
|
||||||
|
- Enable CPU buffer clearing
|
||||||
|
|
||||||
|
The kernel does not by default enforce the disabling of SMT, which leaves
|
||||||
|
SMT systems vulnerable when running untrusted code. The same rationale as
|
||||||
|
for L1TF applies.
|
||||||
|
See :ref:`Documentation/admin-guide/hw-vuln/l1tf.rst <default_mitigations>`.
|
||||||
@@ -17,14 +17,12 @@ etc.
|
|||||||
kernel-parameters
|
kernel-parameters
|
||||||
devices
|
devices
|
||||||
|
|
||||||
This section describes CPU vulnerabilities and provides an overview of the
|
This section describes CPU vulnerabilities and their mitigations.
|
||||||
possible mitigations along with guidance for selecting mitigations if they
|
|
||||||
are configurable at compile, boot or run time.
|
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
|
|
||||||
l1tf
|
hw-vuln/index
|
||||||
|
|
||||||
Here is a set of documents aimed at users who are trying to track down
|
Here is a set of documents aimed at users who are trying to track down
|
||||||
problems and bugs in particular.
|
problems and bugs in particular.
|
||||||
@@ -77,6 +75,7 @@ configure specific aspects of kernel behavior to your liking.
|
|||||||
LSM/index
|
LSM/index
|
||||||
mm/index
|
mm/index
|
||||||
perf-security
|
perf-security
|
||||||
|
acpi/index
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ parameter is applicable::
|
|||||||
APIC APIC support is enabled.
|
APIC APIC support is enabled.
|
||||||
APM Advanced Power Management support is enabled.
|
APM Advanced Power Management support is enabled.
|
||||||
ARM ARM architecture is enabled.
|
ARM ARM architecture is enabled.
|
||||||
|
ARM64 ARM64 architecture is enabled.
|
||||||
AX25 Appropriate AX.25 support is enabled.
|
AX25 Appropriate AX.25 support is enabled.
|
||||||
CLK Common clock infrastructure is enabled.
|
CLK Common clock infrastructure is enabled.
|
||||||
CMA Contiguous Memory Area support is enabled.
|
CMA Contiguous Memory Area support is enabled.
|
||||||
|
|||||||
@@ -704,8 +704,11 @@
|
|||||||
upon panic. This parameter reserves the physical
|
upon panic. This parameter reserves the physical
|
||||||
memory region [offset, offset + size] for that kernel
|
memory region [offset, offset + size] for that kernel
|
||||||
image. If '@offset' is omitted, then a suitable offset
|
image. If '@offset' is omitted, then a suitable offset
|
||||||
is selected automatically. Check
|
is selected automatically.
|
||||||
Documentation/kdump/kdump.txt for further details.
|
[KNL, x86_64] select a region under 4G first, and
|
||||||
|
fall back to reserve region above 4G when '@offset'
|
||||||
|
hasn't been specified.
|
||||||
|
See Documentation/kdump/kdump.txt for further details.
|
||||||
|
|
||||||
crashkernel=range1:size1[,range2:size2,...][@offset]
|
crashkernel=range1:size1[,range2:size2,...][@offset]
|
||||||
[KNL] Same as above, but depends on the memory
|
[KNL] Same as above, but depends on the memory
|
||||||
@@ -1585,7 +1588,7 @@
|
|||||||
Format: { "off" | "enforce" | "fix" | "log" }
|
Format: { "off" | "enforce" | "fix" | "log" }
|
||||||
default: "enforce"
|
default: "enforce"
|
||||||
|
|
||||||
ima_appraise_tcb [IMA]
|
ima_appraise_tcb [IMA] Deprecated. Use ima_policy= instead.
|
||||||
The builtin appraise policy appraises all files
|
The builtin appraise policy appraises all files
|
||||||
owned by uid=0.
|
owned by uid=0.
|
||||||
|
|
||||||
@@ -1612,8 +1615,7 @@
|
|||||||
uid=0.
|
uid=0.
|
||||||
|
|
||||||
The "appraise_tcb" policy appraises the integrity of
|
The "appraise_tcb" policy appraises the integrity of
|
||||||
all files owned by root. (This is the equivalent
|
all files owned by root.
|
||||||
of ima_appraise_tcb.)
|
|
||||||
|
|
||||||
The "secure_boot" policy appraises the integrity
|
The "secure_boot" policy appraises the integrity
|
||||||
of files (eg. kexec kernel image, kernel modules,
|
of files (eg. kexec kernel image, kernel modules,
|
||||||
@@ -1828,6 +1830,9 @@
|
|||||||
ip= [IP_PNP]
|
ip= [IP_PNP]
|
||||||
See Documentation/filesystems/nfs/nfsroot.txt.
|
See Documentation/filesystems/nfs/nfsroot.txt.
|
||||||
|
|
||||||
|
ipcmni_extend [KNL] Extend the maximum number of unique System V
|
||||||
|
IPC identifiers from 32,768 to 16,777,216.
|
||||||
|
|
||||||
irqaffinity= [SMP] Set the default irq affinity mask
|
irqaffinity= [SMP] Set the default irq affinity mask
|
||||||
The argument is a cpu list, as described above.
|
The argument is a cpu list, as described above.
|
||||||
|
|
||||||
@@ -2141,7 +2146,7 @@
|
|||||||
|
|
||||||
Default is 'flush'.
|
Default is 'flush'.
|
||||||
|
|
||||||
For details see: Documentation/admin-guide/l1tf.rst
|
For details see: Documentation/admin-guide/hw-vuln/l1tf.rst
|
||||||
|
|
||||||
l2cr= [PPC]
|
l2cr= [PPC]
|
||||||
|
|
||||||
@@ -2387,6 +2392,32 @@
|
|||||||
Format: <first>,<last>
|
Format: <first>,<last>
|
||||||
Specifies range of consoles to be captured by the MDA.
|
Specifies range of consoles to be captured by the MDA.
|
||||||
|
|
||||||
|
mds= [X86,INTEL]
|
||||||
|
Control mitigation for the Micro-architectural Data
|
||||||
|
Sampling (MDS) vulnerability.
|
||||||
|
|
||||||
|
Certain CPUs are vulnerable to an exploit against CPU
|
||||||
|
internal buffers which can forward information to a
|
||||||
|
disclosure gadget under certain conditions.
|
||||||
|
|
||||||
|
In vulnerable processors, the speculatively
|
||||||
|
forwarded data can be used in a cache side channel
|
||||||
|
attack, to access data to which the attacker does
|
||||||
|
not have direct access.
|
||||||
|
|
||||||
|
This parameter controls the MDS mitigation. The
|
||||||
|
options are:
|
||||||
|
|
||||||
|
full - Enable MDS mitigation on vulnerable CPUs
|
||||||
|
full,nosmt - Enable MDS mitigation and disable
|
||||||
|
SMT on vulnerable CPUs
|
||||||
|
off - Unconditionally disable MDS mitigation
|
||||||
|
|
||||||
|
Not specifying this option is equivalent to
|
||||||
|
mds=full.
|
||||||
|
|
||||||
|
For details see: Documentation/admin-guide/hw-vuln/mds.rst
|
||||||
|
|
||||||
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
||||||
Amount of memory to be used when the kernel is not able
|
Amount of memory to be used when the kernel is not able
|
||||||
to see the whole system memory or for test.
|
to see the whole system memory or for test.
|
||||||
@@ -2544,6 +2575,42 @@
|
|||||||
in the "bleeding edge" mini2440 support kernel at
|
in the "bleeding edge" mini2440 support kernel at
|
||||||
http://repo.or.cz/w/linux-2.6/mini2440.git
|
http://repo.or.cz/w/linux-2.6/mini2440.git
|
||||||
|
|
||||||
|
mitigations=
|
||||||
|
[X86,PPC,S390,ARM64] Control optional mitigations for
|
||||||
|
CPU vulnerabilities. This is a set of curated,
|
||||||
|
arch-independent options, each of which is an
|
||||||
|
aggregation of existing arch-specific options.
|
||||||
|
|
||||||
|
off
|
||||||
|
Disable all optional CPU mitigations. This
|
||||||
|
improves system performance, but it may also
|
||||||
|
expose users to several CPU vulnerabilities.
|
||||||
|
Equivalent to: nopti [X86,PPC]
|
||||||
|
kpti=0 [ARM64]
|
||||||
|
nospectre_v1 [PPC]
|
||||||
|
nobp=0 [S390]
|
||||||
|
nospectre_v2 [X86,PPC,S390,ARM64]
|
||||||
|
spectre_v2_user=off [X86]
|
||||||
|
spec_store_bypass_disable=off [X86,PPC]
|
||||||
|
ssbd=force-off [ARM64]
|
||||||
|
l1tf=off [X86]
|
||||||
|
mds=off [X86]
|
||||||
|
|
||||||
|
auto (default)
|
||||||
|
Mitigate all CPU vulnerabilities, but leave SMT
|
||||||
|
enabled, even if it's vulnerable. This is for
|
||||||
|
users who don't want to be surprised by SMT
|
||||||
|
getting disabled across kernel upgrades, or who
|
||||||
|
have other ways of avoiding SMT-based attacks.
|
||||||
|
Equivalent to: (default behavior)
|
||||||
|
|
||||||
|
auto,nosmt
|
||||||
|
Mitigate all CPU vulnerabilities, disabling SMT
|
||||||
|
if needed. This is for users who always want to
|
||||||
|
be fully mitigated, even if it means losing SMT.
|
||||||
|
Equivalent to: l1tf=flush,nosmt [X86]
|
||||||
|
mds=full,nosmt [X86]
|
||||||
|
|
||||||
mminit_loglevel=
|
mminit_loglevel=
|
||||||
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
||||||
parameter allows control of the logging verbosity for
|
parameter allows control of the logging verbosity for
|
||||||
@@ -2839,11 +2906,11 @@
|
|||||||
noexec=on: enable non-executable mappings (default)
|
noexec=on: enable non-executable mappings (default)
|
||||||
noexec=off: disable non-executable mappings
|
noexec=off: disable non-executable mappings
|
||||||
|
|
||||||
nosmap [X86]
|
nosmap [X86,PPC]
|
||||||
Disable SMAP (Supervisor Mode Access Prevention)
|
Disable SMAP (Supervisor Mode Access Prevention)
|
||||||
even if it is supported by processor.
|
even if it is supported by processor.
|
||||||
|
|
||||||
nosmep [X86]
|
nosmep [X86,PPC]
|
||||||
Disable SMEP (Supervisor Mode Execution Prevention)
|
Disable SMEP (Supervisor Mode Execution Prevention)
|
||||||
even if it is supported by processor.
|
even if it is supported by processor.
|
||||||
|
|
||||||
@@ -2873,10 +2940,10 @@
|
|||||||
check bypass). With this option data leaks are possible
|
check bypass). With this option data leaks are possible
|
||||||
in the system.
|
in the system.
|
||||||
|
|
||||||
nospectre_v2 [X86,PPC_FSL_BOOK3E] Disable all mitigations for the Spectre variant 2
|
nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
|
||||||
(indirect branch prediction) vulnerability. System may
|
the Spectre variant 2 (indirect branch prediction)
|
||||||
allow data leaks with this option, which is equivalent
|
vulnerability. System may allow data leaks with this
|
||||||
to spectre_v2=off.
|
option.
|
||||||
|
|
||||||
nospec_store_bypass_disable
|
nospec_store_bypass_disable
|
||||||
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
||||||
@@ -3110,6 +3177,16 @@
|
|||||||
This will also cause panics on machine check exceptions.
|
This will also cause panics on machine check exceptions.
|
||||||
Useful together with panic=30 to trigger a reboot.
|
Useful together with panic=30 to trigger a reboot.
|
||||||
|
|
||||||
|
page_alloc.shuffle=
|
||||||
|
[KNL] Boolean flag to control whether the page allocator
|
||||||
|
should randomize its free lists. The randomization may
|
||||||
|
be automatically enabled if the kernel detects it is
|
||||||
|
running on a platform with a direct-mapped memory-side
|
||||||
|
cache, and this parameter can be used to
|
||||||
|
override/disable that behavior. The state of the flag
|
||||||
|
can be read from sysfs at:
|
||||||
|
/sys/module/page_alloc/parameters/shuffle.
|
||||||
|
|
||||||
page_owner= [KNL] Boot-time page_owner enabling option.
|
page_owner= [KNL] Boot-time page_owner enabling option.
|
||||||
Storage of the information about who allocated
|
Storage of the information about who allocated
|
||||||
each page is disabled in default. With this switch,
|
each page is disabled in default. With this switch,
|
||||||
@@ -3135,6 +3212,7 @@
|
|||||||
bit 2: print timer info
|
bit 2: print timer info
|
||||||
bit 3: print locks info if CONFIG_LOCKDEP is on
|
bit 3: print locks info if CONFIG_LOCKDEP is on
|
||||||
bit 4: print ftrace buffer
|
bit 4: print ftrace buffer
|
||||||
|
bit 5: print all printk messages in buffer
|
||||||
|
|
||||||
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
panic_on_warn panic() instead of WARN(). Useful to cause kdump
|
||||||
on a WARN().
|
on a WARN().
|
||||||
@@ -3394,6 +3472,8 @@
|
|||||||
bridges without forcing it upstream. Note:
|
bridges without forcing it upstream. Note:
|
||||||
this removes isolation between devices and
|
this removes isolation between devices and
|
||||||
may put more devices in an IOMMU group.
|
may put more devices in an IOMMU group.
|
||||||
|
force_floating [S390] Force usage of floating interrupts.
|
||||||
|
nomio [S390] Do not use MIO instructions.
|
||||||
|
|
||||||
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
|
pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
|
||||||
Management.
|
Management.
|
||||||
@@ -3623,7 +3703,9 @@
|
|||||||
see CONFIG_RAS_CEC help text.
|
see CONFIG_RAS_CEC help text.
|
||||||
|
|
||||||
rcu_nocbs= [KNL]
|
rcu_nocbs= [KNL]
|
||||||
The argument is a cpu list, as described above.
|
The argument is a cpu list, as described above,
|
||||||
|
except that the string "all" can be used to
|
||||||
|
specify every CPU on the system.
|
||||||
|
|
||||||
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
In kernels built with CONFIG_RCU_NOCB_CPU=y, set
|
||||||
the specified list of CPUs to be no-callback CPUs.
|
the specified list of CPUs to be no-callback CPUs.
|
||||||
@@ -3986,7 +4068,9 @@
|
|||||||
[[,]s[mp]#### \
|
[[,]s[mp]#### \
|
||||||
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
|
[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
|
||||||
[[,]f[orce]
|
[[,]f[orce]
|
||||||
Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
|
Where reboot_mode is one of warm (soft) or cold (hard) or gpio
|
||||||
|
(prefix with 'panic_' to set mode for panic
|
||||||
|
reboot only),
|
||||||
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
|
reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
|
||||||
reboot_force is either force or not specified,
|
reboot_force is either force or not specified,
|
||||||
reboot_cpu is s[mp]#### with #### being the processor
|
reboot_cpu is s[mp]#### with #### being the processor
|
||||||
@@ -4703,6 +4787,10 @@
|
|||||||
[x86] unstable: mark the TSC clocksource as unstable, this
|
[x86] unstable: mark the TSC clocksource as unstable, this
|
||||||
marks the TSC unconditionally unstable at bootup and
|
marks the TSC unconditionally unstable at bootup and
|
||||||
avoids any further wobbles once the TSC watchdog notices.
|
avoids any further wobbles once the TSC watchdog notices.
|
||||||
|
[x86] nowatchdog: disable clocksource watchdog. Used
|
||||||
|
in situations with strict latency requirements (where
|
||||||
|
interruptions from clocksource watchdog are not
|
||||||
|
acceptable).
|
||||||
|
|
||||||
turbografx.map[2|3]= [HW,JOY]
|
turbografx.map[2|3]= [HW,JOY]
|
||||||
TurboGraFX parallel port interface
|
TurboGraFX parallel port interface
|
||||||
@@ -5173,6 +5261,13 @@
|
|||||||
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
||||||
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
||||||
|
|
||||||
|
xen_timer_slop= [X86-64,XEN]
|
||||||
|
Set the timer slop (in nanoseconds) for the virtual Xen
|
||||||
|
timers (default is 100000). This adjusts the minimum
|
||||||
|
delta of virtualized Xen timers, where lower values
|
||||||
|
improve timer resolution at the expense of processing
|
||||||
|
more timer interrupts.
|
||||||
|
|
||||||
xirc2ps_cs= [NET,PCMCIA]
|
xirc2ps_cs= [NET,PCMCIA]
|
||||||
Format:
|
Format:
|
||||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||||
|
|||||||
@@ -1,614 +0,0 @@
|
|||||||
L1TF - L1 Terminal Fault
|
|
||||||
========================
|
|
||||||
|
|
||||||
L1 Terminal Fault is a hardware vulnerability which allows unprivileged
|
|
||||||
speculative access to data which is available in the Level 1 Data Cache
|
|
||||||
when the page table entry controlling the virtual address, which is used
|
|
||||||
for the access, has the Present bit cleared or other reserved bits set.
|
|
||||||
|
|
||||||
Affected processors
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
This vulnerability affects a wide range of Intel processors. The
|
|
||||||
vulnerability is not present on:
|
|
||||||
|
|
||||||
- Processors from AMD, Centaur and other non Intel vendors
|
|
||||||
|
|
||||||
- Older processor models, where the CPU family is < 6
|
|
||||||
|
|
||||||
- A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
|
|
||||||
Penwell, Pineview, Silvermont, Airmont, Merrifield)
|
|
||||||
|
|
||||||
- The Intel XEON PHI family
|
|
||||||
|
|
||||||
- Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
|
|
||||||
IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
|
|
||||||
by the Meltdown vulnerability either. These CPUs should become
|
|
||||||
available by end of 2018.
|
|
||||||
|
|
||||||
Whether a processor is affected or not can be read out from the L1TF
|
|
||||||
vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
|
|
||||||
|
|
||||||
Related CVEs
|
|
||||||
------------
|
|
||||||
|
|
||||||
The following CVE entries are related to the L1TF vulnerability:
|
|
||||||
|
|
||||||
============= ================= ==============================
|
|
||||||
CVE-2018-3615 L1 Terminal Fault SGX related aspects
|
|
||||||
CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
|
|
||||||
CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
|
|
||||||
============= ================= ==============================
|
|
||||||
|
|
||||||
Problem
|
|
||||||
-------
|
|
||||||
|
|
||||||
If an instruction accesses a virtual address for which the relevant page
|
|
||||||
table entry (PTE) has the Present bit cleared or other reserved bits set,
|
|
||||||
then speculative execution ignores the invalid PTE and loads the referenced
|
|
||||||
data if it is present in the Level 1 Data Cache, as if the page referenced
|
|
||||||
by the address bits in the PTE was still present and accessible.
|
|
||||||
|
|
||||||
While this is a purely speculative mechanism and the instruction will raise
|
|
||||||
a page fault when it is retired eventually, the pure act of loading the
|
|
||||||
data and making it available to other speculative instructions opens up the
|
|
||||||
opportunity for side channel attacks to unprivileged malicious code,
|
|
||||||
similar to the Meltdown attack.
|
|
||||||
|
|
||||||
While Meltdown breaks the user space to kernel space protection, L1TF
|
|
||||||
allows to attack any physical memory address in the system and the attack
|
|
||||||
works across all protection domains. It allows an attack of SGX and also
|
|
||||||
works from inside virtual machines because the speculation bypasses the
|
|
||||||
extended page table (EPT) protection mechanism.
|
|
||||||
|
|
||||||
|
|
||||||
Attack scenarios
|
|
||||||
----------------
|
|
||||||
|
|
||||||
1. Malicious user space
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
Operating Systems store arbitrary information in the address bits of a
|
|
||||||
PTE which is marked non present. This allows a malicious user space
|
|
||||||
application to attack the physical memory to which these PTEs resolve.
|
|
||||||
In some cases user-space can maliciously influence the information
|
|
||||||
encoded in the address bits of the PTE, thus making attacks more
|
|
||||||
deterministic and more practical.
|
|
||||||
|
|
||||||
The Linux kernel contains a mitigation for this attack vector, PTE
|
|
||||||
inversion, which is permanently enabled and has no performance
|
|
||||||
impact. The kernel ensures that the address bits of PTEs, which are not
|
|
||||||
marked present, never point to cacheable physical memory space.
|
|
||||||
|
|
||||||
A system with an up to date kernel is protected against attacks from
|
|
||||||
malicious user space applications.
|
|
||||||
|
|
||||||
2. Malicious guest in a virtual machine
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
The fact that L1TF breaks all domain protections allows malicious guest
|
|
||||||
OSes, which can control the PTEs directly, and malicious guest user
|
|
||||||
space applications, which run on an unprotected guest kernel lacking the
|
|
||||||
PTE inversion mitigation for L1TF, to attack physical host memory.
|
|
||||||
|
|
||||||
A special aspect of L1TF in the context of virtualization is symmetric
|
|
||||||
multi threading (SMT). The Intel implementation of SMT is called
|
|
||||||
HyperThreading. The fact that Hyperthreads on the affected processors
|
|
||||||
share the L1 Data Cache (L1D) is important for this. As the flaw allows
|
|
||||||
only to attack data which is present in L1D, a malicious guest running
|
|
||||||
on one Hyperthread can attack the data which is brought into the L1D by
|
|
||||||
the context which runs on the sibling Hyperthread of the same physical
|
|
||||||
core. This context can be host OS, host user space or a different guest.
|
|
||||||
|
|
||||||
If the processor does not support Extended Page Tables, the attack is
|
|
||||||
only possible, when the hypervisor does not sanitize the content of the
|
|
||||||
effective (shadow) page tables.
|
|
||||||
|
|
||||||
While solutions exist to mitigate these attack vectors fully, these
|
|
||||||
mitigations are not enabled by default in the Linux kernel because they
|
|
||||||
can affect performance significantly. The kernel provides several
|
|
||||||
mechanisms which can be utilized to address the problem depending on the
|
|
||||||
deployment scenario. The mitigations, their protection scope and impact
|
|
||||||
are described in the next sections.
|
|
||||||
|
|
||||||
The default mitigations and the rationale for choosing them are explained
|
|
||||||
at the end of this document. See :ref:`default_mitigations`.
|
|
||||||
|
|
||||||
.. _l1tf_sys_info:
|
|
||||||
|
|
||||||
L1TF system information
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
The Linux kernel provides a sysfs interface to enumerate the current L1TF
|
|
||||||
status of the system: whether the system is vulnerable, and which
|
|
||||||
mitigations are active. The relevant sysfs file is:
|
|
||||||
|
|
||||||
/sys/devices/system/cpu/vulnerabilities/l1tf
|
|
||||||
|
|
||||||
The possible values in this file are:
|
|
||||||
|
|
||||||
=========================== ===============================
|
|
||||||
'Not affected' The processor is not vulnerable
|
|
||||||
'Mitigation: PTE Inversion' The host protection is active
|
|
||||||
=========================== ===============================
|
|
||||||
|
|
||||||
If KVM/VMX is enabled and the processor is vulnerable then the following
|
|
||||||
information is appended to the 'Mitigation: PTE Inversion' part:
|
|
||||||
|
|
||||||
- SMT status:
|
|
||||||
|
|
||||||
===================== ================
|
|
||||||
'VMX: SMT vulnerable' SMT is enabled
|
|
||||||
'VMX: SMT disabled' SMT is disabled
|
|
||||||
===================== ================
|
|
||||||
|
|
||||||
- L1D Flush mode:
|
|
||||||
|
|
||||||
================================ ====================================
|
|
||||||
'L1D vulnerable' L1D flushing is disabled
|
|
||||||
|
|
||||||
'L1D conditional cache flushes' L1D flush is conditionally enabled
|
|
||||||
|
|
||||||
'L1D cache flushes' L1D flush is unconditionally enabled
|
|
||||||
================================ ====================================
|
|
||||||
|
|
||||||
The resulting grade of protection is discussed in the following sections.
|
|
||||||
|
|
||||||
|
|
||||||
Host mitigation mechanism
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
The kernel is unconditionally protected against L1TF attacks from malicious
|
|
||||||
user space running on the host.
|
|
||||||
|
|
||||||
|
|
||||||
Guest mitigation mechanisms
|
|
||||||
---------------------------
|
|
||||||
|
|
||||||
.. _l1d_flush:
|
|
||||||
|
|
||||||
1. L1D flush on VMENTER
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
To make sure that a guest cannot attack data which is present in the L1D
|
|
||||||
the hypervisor flushes the L1D before entering the guest.
|
|
||||||
|
|
||||||
Flushing the L1D evicts not only the data which should not be accessed
|
|
||||||
by a potentially malicious guest, it also flushes the guest
|
|
||||||
data. Flushing the L1D has a performance impact as the processor has to
|
|
||||||
bring the flushed guest data back into the L1D. Depending on the
|
|
||||||
frequency of VMEXIT/VMENTER and the type of computations in the guest
|
|
||||||
performance degradation in the range of 1% to 50% has been observed. For
|
|
||||||
scenarios where guest VMEXIT/VMENTER are rare the performance impact is
|
|
||||||
minimal. Virtio and mechanisms like posted interrupts are designed to
|
|
||||||
confine the VMEXITs to a bare minimum, but specific configurations and
|
|
||||||
application scenarios might still suffer from a high VMEXIT rate.
|
|
||||||
|
|
||||||
The kernel provides two L1D flush modes:
|
|
||||||
- conditional ('cond')
|
|
||||||
- unconditional ('always')
|
|
||||||
|
|
||||||
The conditional mode avoids L1D flushing after VMEXITs which execute
|
|
||||||
only audited code paths before the corresponding VMENTER. These code
|
|
||||||
paths have been verified that they cannot expose secrets or other
|
|
||||||
interesting data to an attacker, but they can leak information about the
|
|
||||||
address space layout of the hypervisor.
|
|
||||||
|
|
||||||
Unconditional mode flushes L1D on all VMENTER invocations and provides
|
|
||||||
maximum protection. It has a higher overhead than the conditional
|
|
||||||
mode. The overhead cannot be quantified correctly as it depends on the
|
|
||||||
workload scenario and the resulting number of VMEXITs.
|
|
||||||
|
|
||||||
The general recommendation is to enable L1D flush on VMENTER. The kernel
|
|
||||||
defaults to conditional mode on affected processors.
|
|
||||||
|
|
||||||
**Note**, that L1D flush does not prevent the SMT problem because the
|
|
||||||
sibling thread will also bring back its data into the L1D which makes it
|
|
||||||
attackable again.
|
|
||||||
|
|
||||||
L1D flush can be controlled by the administrator via the kernel command
|
|
||||||
line and sysfs control files. See :ref:`mitigation_control_command_line`
|
|
||||||
and :ref:`mitigation_control_kvm`.
|
|
||||||
|
|
||||||
.. _guest_confinement:
|
|
||||||
|
|
||||||
2. Guest VCPU confinement to dedicated physical cores
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
To address the SMT problem, it is possible to make a guest or a group of
|
|
||||||
guests affine to one or more physical cores. The proper mechanism for
|
|
||||||
that is to utilize exclusive cpusets to ensure that no other guest or
|
|
||||||
host tasks can run on these cores.
|
|
||||||
|
|
||||||
If only a single guest or related guests run on sibling SMT threads on
|
|
||||||
the same physical core then they can only attack their own memory and
|
|
||||||
restricted parts of the host memory.
|
|
||||||
|
|
||||||
Host memory is attackable, when one of the sibling SMT threads runs in
|
|
||||||
host OS (hypervisor) context and the other in guest context. The amount
|
|
||||||
of valuable information from the host OS context depends on the context
|
|
||||||
which the host OS executes, i.e. interrupts, soft interrupts and kernel
|
|
||||||
threads. The amount of valuable data from these contexts cannot be
|
|
||||||
declared as non-interesting for an attacker without deep inspection of
|
|
||||||
the code.
|
|
||||||
|
|
||||||
**Note**, that assigning guests to a fixed set of physical cores affects
|
|
||||||
the ability of the scheduler to do load balancing and might have
|
|
||||||
negative effects on CPU utilization depending on the hosting
|
|
||||||
scenario. Disabling SMT might be a viable alternative for particular
|
|
||||||
scenarios.
|
|
||||||
|
|
||||||
For further information about confining guests to a single or to a group
|
|
||||||
of cores consult the cpusets documentation:
|
|
||||||
|
|
||||||
https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
|
|
||||||
|
|
||||||
.. _interrupt_isolation:
|
|
||||||
|
|
||||||
3. Interrupt affinity
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
Interrupts can be made affine to logical CPUs. This is not universally
|
|
||||||
true because there are types of interrupts which are truly per CPU
|
|
||||||
interrupts, e.g. the local timer interrupt. Aside from that, multi-queue
|
|
||||||
devices affine their interrupts to single CPUs or groups of CPUs per
|
|
||||||
queue without allowing the administrator to control the affinities.
|
|
||||||
|
|
||||||
Moving the interrupts, which can be affinity controlled, away from CPUs
|
|
||||||
which run untrusted guests, reduces the attack vector space.
|
|
||||||
|
|
||||||
Whether the interrupts which are affine to CPUs, which run untrusted
|
|
||||||
guests, provide interesting data for an attacker depends on the system
|
|
||||||
configuration and the scenarios which run on the system. While for some
|
|
||||||
of the interrupts it can be assumed that they won't expose interesting
|
|
||||||
information beyond exposing hints about the host OS memory layout, there
|
|
||||||
is no way to make general assumptions.
|
|
||||||
|
|
||||||
Interrupt affinity can be controlled by the administrator via the
|
|
||||||
/proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
|
|
||||||
available at:
|
|
||||||
|
|
||||||
https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
|
|
||||||
|
|
||||||
.. _smt_control:
|
|
||||||
|
|
||||||
4. SMT control
|
|
||||||
^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
To prevent the SMT issues of L1TF it might be necessary to disable SMT
|
|
||||||
completely. Disabling SMT can have a significant performance impact, but
|
|
||||||
the impact depends on the hosting scenario and the type of workloads.
|
|
||||||
The impact of disabling SMT needs also to be weighed against the impact
|
|
||||||
of other mitigation solutions like confining guests to dedicated cores.
|
|
||||||
|
|
||||||
The kernel provides a sysfs interface to retrieve the status of SMT and
|
|
||||||
to control it. It also provides a kernel command line interface to
|
|
||||||
control SMT.
|
|
||||||
|
|
||||||
The kernel command line interface consists of the following options:
|
|
||||||
|
|
||||||
=========== ==========================================================
|
|
||||||
nosmt Affects the bring up of the secondary CPUs during boot. The
|
|
||||||
kernel tries to bring all present CPUs online during the
|
|
||||||
boot process. "nosmt" makes sure that from each physical
|
|
||||||
core only one - the so called primary (hyper) thread is
|
|
||||||
activated. Due to a design flaw of Intel processors related
|
|
||||||
to Machine Check Exceptions the non primary siblings have
|
|
||||||
to be brought up at least partially and are then shut down
|
|
||||||
again. "nosmt" can be undone via the sysfs interface.
|
|
||||||
|
|
||||||
nosmt=force Has the same effect as "nosmt" but it does not allow to
|
|
||||||
undo the SMT disable via the sysfs interface.
|
|
||||||
=========== ==========================================================
|
|
||||||
|
|
||||||
The sysfs interface provides two files:
|
|
||||||
|
|
||||||
- /sys/devices/system/cpu/smt/control
|
|
||||||
- /sys/devices/system/cpu/smt/active
|
|
||||||
|
|
||||||
/sys/devices/system/cpu/smt/control:
|
|
||||||
|
|
||||||
This file allows to read out the SMT control state and provides the
|
|
||||||
ability to disable or (re)enable SMT. The possible states are:
|
|
||||||
|
|
||||||
============== ===================================================
|
|
||||||
on SMT is supported by the CPU and enabled. All
|
|
||||||
logical CPUs can be onlined and offlined without
|
|
||||||
restrictions.
|
|
||||||
|
|
||||||
off SMT is supported by the CPU and disabled. Only
|
|
||||||
the so called primary SMT threads can be onlined
|
|
||||||
and offlined without restrictions. An attempt to
|
|
||||||
online a non-primary sibling is rejected
|
|
||||||
|
|
||||||
forceoff Same as 'off' but the state cannot be controlled.
|
|
||||||
Attempts to write to the control file are rejected.
|
|
||||||
|
|
||||||
notsupported The processor does not support SMT. It's therefore
|
|
||||||
not affected by the SMT implications of L1TF.
|
|
||||||
Attempts to write to the control file are rejected.
|
|
||||||
============== ===================================================
|
|
||||||
|
|
||||||
The possible states which can be written into this file to control SMT
|
|
||||||
state are:
|
|
||||||
|
|
||||||
- on
|
|
||||||
- off
|
|
||||||
- forceoff
|
|
||||||
|
|
||||||
/sys/devices/system/cpu/smt/active:
|
|
||||||
|
|
||||||
This file reports whether SMT is enabled and active, i.e. if on any
|
|
||||||
physical core two or more sibling threads are online.
|
|
||||||
|
|
||||||
SMT control is also possible at boot time via the l1tf kernel command
|
|
||||||
line parameter in combination with L1D flush control. See
|
|
||||||
:ref:`mitigation_control_command_line`.
|
|
||||||
|
|
||||||
5. Disabling EPT
|
|
||||||
^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
Disabling EPT for virtual machines provides full mitigation for L1TF even
|
|
||||||
with SMT enabled, because the effective page tables for guests are
|
|
||||||
managed and sanitized by the hypervisor. Though disabling EPT has a
|
|
||||||
significant performance impact especially when the Meltdown mitigation
|
|
||||||
KPTI is enabled.
|
|
||||||
|
|
||||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
||||||
|
|
||||||
There is ongoing research and development for new mitigation mechanisms to
|
|
||||||
address the performance impact of disabling SMT or EPT.
|
|
||||||
|
|
||||||
.. _mitigation_control_command_line:
|
|
||||||
|
|
||||||
Mitigation control on the kernel command line
|
|
||||||
---------------------------------------------
|
|
||||||
|
|
||||||
The kernel command line allows to control the L1TF mitigations at boot
|
|
||||||
time with the option "l1tf=". The valid arguments for this option are:
|
|
||||||
|
|
||||||
============ =============================================================
|
|
||||||
full Provides all available mitigations for the L1TF
|
|
||||||
vulnerability. Disables SMT and enables all mitigations in
|
|
||||||
the hypervisors, i.e. unconditional L1D flushing
|
|
||||||
|
|
||||||
SMT control and L1D flush control via the sysfs interface
|
|
||||||
is still possible after boot. Hypervisors will issue a
|
|
||||||
warning when the first VM is started in a potentially
|
|
||||||
insecure configuration, i.e. SMT enabled or L1D flush
|
|
||||||
disabled.
|
|
||||||
|
|
||||||
full,force Same as 'full', but disables SMT and L1D flush runtime
|
|
||||||
control. Implies the 'nosmt=force' command line option.
|
|
||||||
(i.e. sysfs control of SMT is disabled.)
|
|
||||||
|
|
||||||
flush Leaves SMT enabled and enables the default hypervisor
|
|
||||||
mitigation, i.e. conditional L1D flushing
|
|
||||||
|
|
||||||
SMT control and L1D flush control via the sysfs interface
|
|
||||||
is still possible after boot. Hypervisors will issue a
|
|
||||||
warning when the first VM is started in a potentially
|
|
||||||
insecure configuration, i.e. SMT enabled or L1D flush
|
|
||||||
disabled.
|
|
||||||
|
|
||||||
flush,nosmt Disables SMT and enables the default hypervisor mitigation,
|
|
||||||
i.e. conditional L1D flushing.
|
|
||||||
|
|
||||||
SMT control and L1D flush control via the sysfs interface
|
|
||||||
is still possible after boot. Hypervisors will issue a
|
|
||||||
warning when the first VM is started in a potentially
|
|
||||||
insecure configuration, i.e. SMT enabled or L1D flush
|
|
||||||
disabled.
|
|
||||||
|
|
||||||
flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
|
|
||||||
started in a potentially insecure configuration.
|
|
||||||
|
|
||||||
off Disables hypervisor mitigations and doesn't emit any
|
|
||||||
warnings.
|
|
||||||
It also drops the swap size and available RAM limit restrictions
|
|
||||||
on both hypervisor and bare metal.
|
|
||||||
|
|
||||||
============ =============================================================
|
|
||||||
|
|
||||||
The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
|
||||||
|
|
||||||
|
|
||||||
.. _mitigation_control_kvm:
|
|
||||||
|
|
||||||
Mitigation control for KVM - module parameter
|
|
||||||
-------------------------------------------------------------
|
|
||||||
|
|
||||||
The KVM hypervisor mitigation mechanism, flushing the L1D cache when
|
|
||||||
entering a guest, can be controlled with a module parameter.
|
|
||||||
|
|
||||||
The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
|
|
||||||
following arguments:
|
|
||||||
|
|
||||||
============ ==============================================================
|
|
||||||
always L1D cache flush on every VMENTER.
|
|
||||||
|
|
||||||
cond Flush L1D on VMENTER only when the code between VMEXIT and
|
|
||||||
VMENTER can leak host memory which is considered
|
|
||||||
interesting for an attacker. This still can leak host memory
|
|
||||||
which allows e.g. to determine the host's address space layout.
|
|
||||||
|
|
||||||
never Disables the mitigation
|
|
||||||
============ ==============================================================
|
|
||||||
|
|
||||||
The parameter can be provided on the kernel command line, as a module
|
|
||||||
parameter when loading the modules and at runtime modified via the sysfs
|
|
||||||
file:
|
|
||||||
|
|
||||||
/sys/module/kvm_intel/parameters/vmentry_l1d_flush
|
|
||||||
|
|
||||||
The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
|
||||||
line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
|
||||||
module parameter is ignored and writes to the sysfs file are rejected.
|
|
||||||
|
|
||||||
|
|
||||||
Mitigation selection guide
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
1. No virtualization in use
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
The system is protected by the kernel unconditionally and no further
|
|
||||||
action is required.
|
|
||||||
|
|
||||||
2. Virtualization with trusted guests
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
If the guest comes from a trusted source and the guest OS kernel is
|
|
||||||
guaranteed to have the L1TF mitigations in place the system is fully
|
|
||||||
protected against L1TF and no further action is required.
|
|
||||||
|
|
||||||
To avoid the overhead of the default L1D flushing on VMENTER the
|
|
||||||
administrator can disable the flushing via the kernel command line and
|
|
||||||
sysfs control files. See :ref:`mitigation_control_command_line` and
|
|
||||||
:ref:`mitigation_control_kvm`.
|
|
||||||
|
|
||||||
|
|
||||||
3. Virtualization with untrusted guests
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
|
|
||||||
3.1. SMT not supported or disabled
|
|
||||||
""""""""""""""""""""""""""""""""""
|
|
||||||
|
|
||||||
If SMT is not supported by the processor or disabled in the BIOS or by
|
|
||||||
the kernel, it's only required to enforce L1D flushing on VMENTER.
|
|
||||||
|
|
||||||
Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
||||||
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
||||||
|
|
||||||
3.2. EPT not supported or disabled
|
|
||||||
""""""""""""""""""""""""""""""""""
|
|
||||||
|
|
||||||
If EPT is not supported by the processor or disabled in the hypervisor,
|
|
||||||
the system is fully protected. SMT can stay enabled and L1D flushing on
|
|
||||||
VMENTER is not required.
|
|
||||||
|
|
||||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
||||||
|
|
||||||
3.3. SMT and EPT supported and active
|
|
||||||
"""""""""""""""""""""""""""""""""""""
|
|
||||||
|
|
||||||
If SMT and EPT are supported and active then various degrees of
|
|
||||||
mitigations can be employed:
|
|
||||||
|
|
||||||
- L1D flushing on VMENTER:
|
|
||||||
|
|
||||||
L1D flushing on VMENTER is the minimal protection requirement, but it
|
|
||||||
is only potent in combination with other mitigation methods.
|
|
||||||
|
|
||||||
Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
||||||
:ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
||||||
|
|
||||||
- Guest confinement:
|
|
||||||
|
|
||||||
Confinement of guests to a single or a group of physical cores which
|
|
||||||
are not running any other processes, can reduce the attack surface
|
|
||||||
significantly, but interrupts, soft interrupts and kernel threads can
|
|
||||||
still expose valuable data to a potential attacker. See
|
|
||||||
:ref:`guest_confinement`.
|
|
||||||
|
|
||||||
- Interrupt isolation:
|
|
||||||
|
|
||||||
Isolating the guest CPUs from interrupts can reduce the attack surface
|
|
||||||
further, but still allows a malicious guest to explore a limited amount
|
|
||||||
of host physical memory. This can at least be used to gain knowledge
|
|
||||||
about the host address space layout. The interrupts which have a fixed
|
|
||||||
affinity to the CPUs which run the untrusted guests can depending on
|
|
||||||
the scenario still trigger soft interrupts and schedule kernel threads
|
|
||||||
which might expose valuable information. See
|
|
||||||
:ref:`interrupt_isolation`.
|
|
||||||
|
|
||||||
The above three mitigation methods combined can provide protection to a
|
|
||||||
certain degree, but the risk of the remaining attack surface has to be
|
|
||||||
carefully analyzed. For full protection the following methods are
|
|
||||||
available:
|
|
||||||
|
|
||||||
- Disabling SMT:
|
|
||||||
|
|
||||||
Disabling SMT and enforcing the L1D flushing provides the maximum
|
|
||||||
amount of protection. This mitigation does not depend on any of the
|
|
||||||
above mitigation methods.
|
|
||||||
|
|
||||||
SMT control and L1D flushing can be tuned by the command line
|
|
||||||
parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
|
|
||||||
time with the matching sysfs control files. See :ref:`smt_control`,
|
|
||||||
:ref:`mitigation_control_command_line` and
|
|
||||||
:ref:`mitigation_control_kvm`.
|
|
||||||
|
|
||||||
- Disabling EPT:
|
|
||||||
|
|
||||||
Disabling EPT provides the maximum amount of protection as well. It does
|
|
||||||
not depend on any of the above mitigation methods. SMT can stay
|
|
||||||
enabled and L1D flushing is not required, but the performance impact is
|
|
||||||
significant.
|
|
||||||
|
|
||||||
EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
|
|
||||||
parameter.
|
|
||||||
|
|
||||||
3.4. Nested virtual machines
|
|
||||||
""""""""""""""""""""""""""""
|
|
||||||
|
|
||||||
When nested virtualization is in use, three operating systems are involved:
|
|
||||||
the bare metal hypervisor, the nested hypervisor and the nested virtual
|
|
||||||
machine. VMENTER operations from the nested hypervisor into the nested
|
|
||||||
guest will always be processed by the bare metal hypervisor. If KVM is the
|
|
||||||
bare metal hypervisor it will:
|
|
||||||
|
|
||||||
- Flush the L1D cache on every switch from the nested hypervisor to the
|
|
||||||
nested virtual machine, so that the nested hypervisor's secrets are not
|
|
||||||
exposed to the nested virtual machine;
|
|
||||||
|
|
||||||
- Flush the L1D cache on every switch from the nested virtual machine to
|
|
||||||
the nested hypervisor; this is a complex operation, and flushing the L1D
|
|
||||||
cache avoids that the bare metal hypervisor's secrets are exposed to the
|
|
||||||
nested virtual machine;
|
|
||||||
|
|
||||||
- Instruct the nested hypervisor to not perform any L1D cache flush. This
|
|
||||||
is an optimization to avoid double L1D flushing.
|
|
||||||
|
|
||||||
|
|
||||||
.. _default_mitigations:
|
|
||||||
|
|
||||||
Default mitigations
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
The kernel default mitigations for vulnerable processors are:
|
|
||||||
|
|
||||||
- PTE inversion to protect against malicious user space. This is done
|
|
||||||
unconditionally and cannot be controlled. The swap storage is limited
|
|
||||||
to ~16TB.
|
|
||||||
|
|
||||||
- L1D conditional flushing on VMENTER when EPT is enabled for
|
|
||||||
a guest.
|
|
||||||
|
|
||||||
The kernel does not by default enforce the disabling of SMT, which leaves
|
|
||||||
SMT systems vulnerable when running untrusted guests with EPT enabled.
|
|
||||||
|
|
||||||
The rationale for this choice is:
|
|
||||||
|
|
||||||
- Force disabling SMT can break existing setups, especially with
|
|
||||||
unattended updates.
|
|
||||||
|
|
||||||
- If regular users run untrusted guests on their machine, then L1TF is
|
|
||||||
just an add on to other malware which might be embedded in an untrusted
|
|
||||||
guest, e.g. spam-bots or attacks on the local network.
|
|
||||||
|
|
||||||
There is no technical way to prevent a user from running untrusted code
|
|
||||||
on their machines blindly.
|
|
||||||
|
|
||||||
- It's technically extremely unlikely and from today's knowledge even
|
|
||||||
impossible that L1TF can be exploited via the most popular attack
|
|
||||||
mechanisms like JavaScript because these mechanisms have no way to
|
|
||||||
control PTEs. If this would be possible and no other mitigation would
|
|
||||||
be possible, then the default might be different.
|
|
||||||
|
|
||||||
- The administrators of cloud and hosting setups have to carefully
|
|
||||||
analyze the risk for their scenarios and make the appropriate
|
|
||||||
mitigation choices, which might even vary across their deployed
|
|
||||||
machines and also result in other changes of their overall setup.
|
|
||||||
There is no way for the kernel to provide a sensible default for this
|
|
||||||
kind of scenario.
|
|
||||||
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
169
Documentation/admin-guide/mm/numaperf.rst
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
.. _numaperf:
|
||||||
|
|
||||||
|
=============
|
||||||
|
NUMA Locality
|
||||||
|
=============
|
||||||
|
|
||||||
|
Some platforms may have multiple types of memory attached to a compute
|
||||||
|
node. These disparate memory ranges may share some characteristics, such
|
||||||
|
as CPU cache coherence, but may have different performance. For example,
|
||||||
|
different media types and buses affect bandwidth and latency.
|
||||||
|
|
||||||
|
A system supports such heterogeneous memory by grouping each memory type
|
||||||
|
under different domains, or "nodes", based on locality and performance
|
||||||
|
characteristics. Some memory may share the same node as a CPU, and others
|
||||||
|
are provided as memory only nodes. While memory only nodes do not provide
|
||||||
|
CPUs, they may still be local to one or more compute nodes relative to
|
||||||
|
other nodes. The following diagram shows one such example of two compute
|
||||||
|
nodes with local memory and a memory only node for each compute node:
|
||||||
|
|
||||||
|
+------------------+ +------------------+
|
||||||
|
| Compute Node 0 +-----+ Compute Node 1 |
|
||||||
|
| Local Node0 Mem | | Local Node1 Mem |
|
||||||
|
+--------+---------+ +--------+---------+
|
||||||
|
| |
|
||||||
|
+--------+---------+ +--------+---------+
|
||||||
|
| Slower Node2 Mem | | Slower Node3 Mem |
|
||||||
|
+------------------+ +------------------+
|
||||||
|
|
||||||
|
A "memory initiator" is a node containing one or more devices such as
|
||||||
|
CPUs or separate memory I/O devices that can initiate memory requests.
|
||||||
|
A "memory target" is a node containing one or more physical address
|
||||||
|
ranges accessible from one or more memory initiators.
|
||||||
|
|
||||||
|
When multiple memory initiators exist, they may not all have the same
|
||||||
|
performance when accessing a given memory target. Each initiator-target
|
||||||
|
pair may be organized into different ranked access classes to represent
|
||||||
|
this relationship. The highest performing initiator to a given target
|
||||||
|
is considered to be one of that target's local initiators, and given
|
||||||
|
the highest access class, 0. Any given target may have one or more
|
||||||
|
local initiators, and any given initiator may have multiple local
|
||||||
|
memory targets.
|
||||||
|
|
||||||
|
To aid applications matching memory targets with their initiators, the
|
||||||
|
kernel provides symlinks to each other. The following example lists the
|
||||||
|
relationship for the access class "0" memory initiators and targets::
|
||||||
|
|
||||||
|
# symlinks -v /sys/devices/system/node/nodeX/access0/targets/
|
||||||
|
relative: /sys/devices/system/node/nodeX/access0/targets/nodeY -> ../../nodeY
|
||||||
|
|
||||||
|
# symlinks -v /sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
relative: /sys/devices/system/node/nodeY/access0/initiators/nodeX -> ../../nodeX
|
||||||
|
|
||||||
|
A memory initiator may have multiple memory targets in the same access
|
||||||
|
class. The target memory's initiators in a given class indicate the
|
||||||
|
nodes' access characteristics share the same performance relative to other
|
||||||
|
linked initiator nodes. Each target within an initiator's access class,
|
||||||
|
though, does not necessarily perform the same as each other.
|
||||||
|
|
||||||
|
================
|
||||||
|
NUMA Performance
|
||||||
|
================
|
||||||
|
|
||||||
|
Applications may wish to consider which node they want their memory to
|
||||||
|
be allocated from based on the node's performance characteristics. If
|
||||||
|
the system provides these attributes, the kernel exports them under the
|
||||||
|
node sysfs hierarchy by appending the attributes directory under the
|
||||||
|
memory node's access class 0 initiators as follows::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
|
||||||
|
These attributes apply only when accessed from nodes that
|
||||||
|
are linked under this access's initiators.
|
||||||
|
|
||||||
|
The performance characteristics the kernel provides for the local initiators
|
||||||
|
are exported as follows::
|
||||||
|
|
||||||
|
# tree -P "read*|write*" /sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
/sys/devices/system/node/nodeY/access0/initiators/
|
||||||
|
|-- read_bandwidth
|
||||||
|
|-- read_latency
|
||||||
|
|-- write_bandwidth
|
||||||
|
`-- write_latency
|
||||||
|
|
||||||
|
The bandwidth attributes are provided in MiB/second.
|
||||||
|
|
||||||
|
The latency attributes are provided in nanoseconds.
|
||||||
|
|
||||||
|
The values reported here correspond to the rated latency and bandwidth
|
||||||
|
for the platform.
|
||||||
|
|
||||||
|
==========
|
||||||
|
NUMA Cache
|
||||||
|
==========
|
||||||
|
|
||||||
|
System memory may be constructed in a hierarchy of elements with various
|
||||||
|
performance characteristics in order to provide a large address space of
|
||||||
|
slower performing memory cached by a smaller higher performing memory. The
|
||||||
|
system physical addresses memory initiators are aware of are provided
|
||||||
|
by the last memory level in the hierarchy. The system meanwhile uses
|
||||||
|
higher performing memory to transparently cache access to progressively
|
||||||
|
slower levels.
|
||||||
|
|
||||||
|
The term "far memory" is used to denote the last level memory in the
|
||||||
|
hierarchy. Each increasing cache level provides higher performing
|
||||||
|
initiator access, and the term "near memory" represents the fastest
|
||||||
|
cache provided by the system.
|
||||||
|
|
||||||
|
This numbering is different than CPU caches where the cache level (ex:
|
||||||
|
L1, L2, L3) uses the CPU-side view where each increased level is lower
|
||||||
|
performing. In contrast, the memory cache level is centric to the last
|
||||||
|
level memory, so the higher numbered cache level corresponds to memory
|
||||||
|
nearer to the CPU, and further from far memory.
|
||||||
|
|
||||||
|
The memory-side caches are not directly addressable by software. When
|
||||||
|
software accesses a system address, the system will return it from the
|
||||||
|
near memory cache if it is present. If it is not present, the system
|
||||||
|
accesses the next level of memory until there is either a hit in that
|
||||||
|
cache level, or it reaches far memory.
|
||||||
|
|
||||||
|
An application does not need to know about caching attributes in order
|
||||||
|
to use the system. Software may optionally query the memory cache
|
||||||
|
attributes in order to maximize the performance out of such a setup.
|
||||||
|
If the system provides a way for the kernel to discover this information,
|
||||||
|
for example with ACPI HMAT (Heterogeneous Memory Attribute Table),
|
||||||
|
the kernel will append these attributes to the NUMA node memory target.
|
||||||
|
|
||||||
|
When the kernel first registers a memory cache with a node, the kernel
|
||||||
|
will create the following directory::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/
|
||||||
|
|
||||||
|
If that directory is not present, the system either does not provide
|
||||||
|
a memory-side cache, or that information is not accessible to the kernel.
|
||||||
|
|
||||||
|
The attributes for each level of cache are provided under its cache
|
||||||
|
level index::
|
||||||
|
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexA/
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexB/
|
||||||
|
/sys/devices/system/node/nodeX/memory_side_cache/indexC/
|
||||||
|
|
||||||
|
Each cache level's directory provides its attributes. For example, the
|
||||||
|
following shows a single cache level and the attributes available for
|
||||||
|
software to query::
|
||||||
|
|
||||||
|
# tree /sys/devices/system/node/node0/memory_side_cache/
|
||||||
|
/sys/devices/system/node/node0/memory_side_cache/
|
||||||
|
|-- index1
|
||||||
|
| |-- indexing
|
||||||
|
| |-- line_size
|
||||||
|
| |-- size
|
||||||
|
| `-- write_policy
|
||||||
|
|
||||||
|
The "indexing" will be 0 if it is a direct-mapped cache, and non-zero
|
||||||
|
for any other index-based, multi-way associativity.
|
||||||
|
|
||||||
|
The "line_size" is the number of bytes accessed from the next cache
|
||||||
|
level on a miss.
|
||||||
|
|
||||||
|
The "size" is the number of bytes provided by this cache level.
|
||||||
|
|
||||||
|
The "write_policy" will be 0 for write-back, and non-zero for
|
||||||
|
write-through caching.
|
||||||
|
|
||||||
|
========
|
||||||
|
See Also
|
||||||
|
========
|
||||||
|
.. [1] https://www.uefi.org/sites/default/files/resources/ACPI_6_2.pdf
|
||||||
|
Section 5.2.27
|
||||||
@@ -1,3 +1,6 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
.. |struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy <cpufreq_policy>`
|
||||||
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
.. |intel_pstate| replace:: :doc:`intel_pstate <intel_pstate>`
|
||||||
|
|
||||||
@@ -5,9 +8,10 @@
|
|||||||
CPU Performance Scaling
|
CPU Performance Scaling
|
||||||
=======================
|
=======================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
The Concept of CPU Performance Scaling
|
The Concept of CPU Performance Scaling
|
||||||
======================================
|
======================================
|
||||||
@@ -396,8 +400,8 @@ RT or deadline scheduling classes, the governor will increase the frequency to
|
|||||||
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn,
|
||||||
if it is invoked by the CFS scheduling class, the governor will use the
|
if it is invoked by the CFS scheduling class, the governor will use the
|
||||||
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
Per-Entity Load Tracking (PELT) metric for the root control group of the
|
||||||
given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_
|
given CPU as the CPU utilization estimate (see the *Per-entity load tracking*
|
||||||
LWN.net article for a description of the PELT mechanism). Then, the new
|
LWN.net article [1]_ for a description of the PELT mechanism). Then, the new
|
||||||
CPU frequency to apply is computed in accordance with the formula
|
CPU frequency to apply is computed in accordance with the formula
|
||||||
|
|
||||||
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
f = 1.25 * ``f_0`` * ``util`` / ``max``
|
||||||
@@ -698,4 +702,8 @@ hardware feature (e.g. all Intel ones), even if the
|
|||||||
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set.
|
||||||
|
|
||||||
|
|
||||||
.. _Per-entity load tracking: https://lwn.net/Articles/531853/
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. [1] Jonathan Corbet, *Per-entity load tracking*,
|
||||||
|
https://lwn.net/Articles/531853/
|
||||||
|
|||||||
@@ -1,3 +1,6 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
.. |struct cpuidle_state| replace:: :c:type:`struct cpuidle_state <cpuidle_state>`
|
||||||
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
.. |cpufreq| replace:: :doc:`CPU Performance Scaling <cpufreq>`
|
||||||
|
|
||||||
@@ -5,9 +8,10 @@
|
|||||||
CPU Idle Time Management
|
CPU Idle Time Management
|
||||||
========================
|
========================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2018 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2018 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
Concepts
|
Concepts
|
||||||
========
|
========
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
================
|
================
|
||||||
Power Management
|
Power Management
|
||||||
================
|
================
|
||||||
|
|||||||
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
41
Documentation/admin-guide/pm/intel_epb.rst
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
|
======================================
|
||||||
|
Intel Performance and Energy Bias Hint
|
||||||
|
======================================
|
||||||
|
|
||||||
|
:Copyright: |copy| 2019 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
|
.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c
|
||||||
|
:doc: overview
|
||||||
|
|
||||||
|
Intel Performance and Energy Bias Attribute in ``sysfs``
|
||||||
|
========================================================
|
||||||
|
|
||||||
|
The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU
|
||||||
|
can be checked or updated through a ``sysfs`` attribute (file) under
|
||||||
|
:file:`/sys/devices/system/cpu/cpu<N>/power/`, where the CPU number ``<N>``
|
||||||
|
is allocated at the system initialization time:
|
||||||
|
|
||||||
|
``energy_perf_bias``
|
||||||
|
Shows the current EPB value for the CPU in a sliding scale 0 - 15, where
|
||||||
|
a value of 0 corresponds to a hint preference for highest performance
|
||||||
|
and a value of 15 corresponds to the maximum energy savings.
|
||||||
|
|
||||||
|
In order to update the EPB value for the CPU, this attribute can be
|
||||||
|
written to, either with a number in the 0 - 15 sliding scale above, or
|
||||||
|
with one of the strings: "performance", "balance-performance", "normal",
|
||||||
|
"balance-power", "power" that represent values reflected by their
|
||||||
|
meaning.
|
||||||
|
|
||||||
|
This attribute is present for all online CPUs supporting the EPB
|
||||||
|
feature.
|
||||||
|
|
||||||
|
Note that while the EPB interface to the processor is defined at the logical CPU
|
||||||
|
level, the physical register backing it may be shared by multiple CPUs (for
|
||||||
|
example, SMT siblings or cores in one package). For this reason, updating the
|
||||||
|
EPB value for one CPU may cause the EPB values for other CPUs to change.
|
||||||
@@ -1,10 +1,13 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===============================================
|
===============================================
|
||||||
``intel_pstate`` CPU Performance Scaling Driver
|
``intel_pstate`` CPU Performance Scaling Driver
|
||||||
===============================================
|
===============================================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
|
|
||||||
General Information
|
General Information
|
||||||
@@ -20,11 +23,10 @@ you have not done that yet.]
|
|||||||
|
|
||||||
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
For the processors supported by ``intel_pstate``, the P-state concept is broader
|
||||||
than just an operating frequency or an operating performance point (see the
|
than just an operating frequency or an operating performance point (see the
|
||||||
`LinuxCon Europe 2015 presentation by Kristen Accardi <LCEU2015_>`_ for more
|
LinuxCon Europe 2015 presentation by Kristen Accardi [1]_ for more
|
||||||
information about that). For this reason, the representation of P-states used
|
information about that). For this reason, the representation of P-states used
|
||||||
by ``intel_pstate`` internally follows the hardware specification (for details
|
by ``intel_pstate`` internally follows the hardware specification (for details
|
||||||
refer to `Intel® 64 and IA-32 Architectures Software Developer’s Manual
|
refer to Intel Software Developer’s Manual [2]_). However, the ``CPUFreq`` core
|
||||||
Volume 3: System Programming Guide <SDM_>`_). However, the ``CPUFreq`` core
|
|
||||||
uses frequencies for identifying operating performance points of CPUs and
|
uses frequencies for identifying operating performance points of CPUs and
|
||||||
frequencies are involved in the user space interface exposed by it, so
|
frequencies are involved in the user space interface exposed by it, so
|
||||||
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
``intel_pstate`` maps its internal representation of P-states to frequencies too
|
||||||
@@ -561,9 +563,9 @@ or to pin every task potentially sensitive to them to a specific CPU.]
|
|||||||
|
|
||||||
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
On the majority of systems supported by ``intel_pstate``, the ACPI tables
|
||||||
provided by the platform firmware contain ``_PSS`` objects returning information
|
provided by the platform firmware contain ``_PSS`` objects returning information
|
||||||
that can be used for CPU performance scaling (refer to the `ACPI specification`_
|
that can be used for CPU performance scaling (refer to the ACPI specification
|
||||||
for details on the ``_PSS`` objects and the format of the information returned
|
[3]_ for details on the ``_PSS`` objects and the format of the information
|
||||||
by them).
|
returned by them).
|
||||||
|
|
||||||
The information returned by the ACPI ``_PSS`` objects is used by the
|
The information returned by the ACPI ``_PSS`` objects is used by the
|
||||||
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
``acpi-cpufreq`` scaling driver. On systems supported by ``intel_pstate``
|
||||||
@@ -728,6 +730,14 @@ P-state is called, the ``ftrace`` filter can be set to to
|
|||||||
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
<idle>-0 [000] ..s. 2537.654843: intel_pstate_set_pstate <-intel_pstate_timer_func
|
||||||
|
|
||||||
|
|
||||||
.. _LCEU2015: http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
References
|
||||||
.. _SDM: http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
==========
|
||||||
.. _ACPI specification: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
|
|
||||||
|
.. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*,
|
||||||
|
http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf
|
||||||
|
|
||||||
|
.. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*,
|
||||||
|
http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html
|
||||||
|
|
||||||
|
.. [3] *Advanced Configuration and Power Interface Specification*,
|
||||||
|
https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===================
|
===================
|
||||||
System Sleep States
|
System Sleep States
|
||||||
===================
|
===================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
Sleep states are global low-power states of the entire system in which user
|
Sleep states are global low-power states of the entire system in which user
|
||||||
space code cannot be executed and the overall system activity is significantly
|
space code cannot be executed and the overall system activity is significantly
|
||||||
|
|||||||
@@ -1,10 +1,14 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. include:: <isonum.txt>
|
||||||
|
|
||||||
===========================
|
===========================
|
||||||
Power Management Strategies
|
Power Management Strategies
|
||||||
===========================
|
===========================
|
||||||
|
|
||||||
::
|
:Copyright: |copy| 2017 Intel Corporation
|
||||||
|
|
||||||
|
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
||||||
|
|
||||||
Copyright (c) 2017 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
|
|
||||||
|
|
||||||
The Linux kernel supports two major high-level power management strategies.
|
The Linux kernel supports two major high-level power management strategies.
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
============================
|
============================
|
||||||
System-Wide Power Management
|
System-Wide Power Management
|
||||||
============================
|
============================
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
==============================
|
==============================
|
||||||
Working-State Power Management
|
Working-State Power Management
|
||||||
==============================
|
==============================
|
||||||
@@ -8,3 +10,4 @@ Working-State Power Management
|
|||||||
cpuidle
|
cpuidle
|
||||||
cpufreq
|
cpufreq
|
||||||
intel_pstate
|
intel_pstate
|
||||||
|
intel_epb
|
||||||
|
|||||||
@@ -209,6 +209,22 @@ infrastructure:
|
|||||||
| AT | [35-32] | y |
|
| AT | [35-32] | y |
|
||||||
x--------------------------------------------------x
|
x--------------------------------------------------x
|
||||||
|
|
||||||
|
6) ID_AA64ZFR0_EL1 - SVE feature ID register 0
|
||||||
|
|
||||||
|
x--------------------------------------------------x
|
||||||
|
| Name | bits | visible |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SM4 | [43-40] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SHA3 | [35-32] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| BitPerm | [19-16] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| AES | [7-4] | y |
|
||||||
|
|--------------------------------------------------|
|
||||||
|
| SVEVer | [3-0] | y |
|
||||||
|
x--------------------------------------------------x
|
||||||
|
|
||||||
Appendix I: Example
|
Appendix I: Example
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ architected discovery mechanism available to userspace code at EL0. The
|
|||||||
kernel exposes the presence of these features to userspace through a set
|
kernel exposes the presence of these features to userspace through a set
|
||||||
of flags called hwcaps, exposed in the auxiliary vector.
|
of flags called hwcaps, exposed in the auxiliary vector.
|
||||||
|
|
||||||
Userspace software can test for features by acquiring the AT_HWCAP entry
|
Userspace software can test for features by acquiring the AT_HWCAP or
|
||||||
of the auxilliary vector, and testing whether the relevant flags are
|
AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
|
||||||
set, e.g.
|
flags are set, e.g.
|
||||||
|
|
||||||
bool floating_point_is_present(void)
|
bool floating_point_is_present(void)
|
||||||
{
|
{
|
||||||
@@ -135,6 +135,10 @@ HWCAP_DCPOP
|
|||||||
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_DCPODP
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
|
||||||
|
|
||||||
HWCAP_SHA3
|
HWCAP_SHA3
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
Functionality implied by ID_AA64ISAR0_EL1.SHA3 == 0b0001.
|
||||||
@@ -159,6 +163,30 @@ HWCAP_SVE
|
|||||||
|
|
||||||
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
Functionality implied by ID_AA64PFR0_EL1.SVE == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVE2
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SVEVer == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVEAES
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVEPMULL
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.AES == 0b0010.
|
||||||
|
|
||||||
|
HWCAP2_SVEBITPERM
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.BitPerm == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVESHA3
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SHA3 == 0b0001.
|
||||||
|
|
||||||
|
HWCAP2_SVESM4
|
||||||
|
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.SM4 == 0b0001.
|
||||||
|
|
||||||
HWCAP_ASIMDFHM
|
HWCAP_ASIMDFHM
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0001.
|
||||||
@@ -194,3 +222,10 @@ HWCAP_PACG
|
|||||||
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
Functionality implied by ID_AA64ISAR1_EL1.GPA == 0b0001 or
|
||||||
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
|
||||||
Documentation/arm64/pointer-authentication.txt.
|
Documentation/arm64/pointer-authentication.txt.
|
||||||
|
|
||||||
|
|
||||||
|
4. Unused AT_HWCAP bits
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
For interoperation with userspace, the kernel guarantees that bits 62
|
||||||
|
and 63 of AT_HWCAP will always be returned as 0.
|
||||||
|
|||||||
85
Documentation/arm64/perf.txt
Normal file
85
Documentation/arm64/perf.txt
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
Perf Event Attributes
|
||||||
|
=====================
|
||||||
|
|
||||||
|
Author: Andrew Murray <andrew.murray@arm.com>
|
||||||
|
Date: 2019-03-06
|
||||||
|
|
||||||
|
exclude_user
|
||||||
|
------------
|
||||||
|
|
||||||
|
This attribute excludes userspace.
|
||||||
|
|
||||||
|
Userspace always runs at EL0 and thus this attribute will exclude EL0.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_kernel
|
||||||
|
--------------
|
||||||
|
|
||||||
|
This attribute excludes the kernel.
|
||||||
|
|
||||||
|
The kernel runs at EL2 with VHE and EL1 without. Guest kernels always run
|
||||||
|
at EL1.
|
||||||
|
|
||||||
|
For the host this attribute will exclude EL1 and additionally EL2 on a VHE
|
||||||
|
system.
|
||||||
|
|
||||||
|
For the guest this attribute will exclude EL1. Please note that EL2 is
|
||||||
|
never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_hv
|
||||||
|
----------
|
||||||
|
|
||||||
|
This attribute excludes the hypervisor.
|
||||||
|
|
||||||
|
For a VHE host this attribute is ignored as we consider the host kernel to
|
||||||
|
be the hypervisor.
|
||||||
|
|
||||||
|
For a non-VHE host this attribute will exclude EL2 as we consider the
|
||||||
|
hypervisor to be any code that runs at EL2 which is predominantly used for
|
||||||
|
guest/host transitions.
|
||||||
|
|
||||||
|
For the guest this attribute has no effect. Please note that EL2 is
|
||||||
|
never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
exclude_host / exclude_guest
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
These attributes exclude the KVM host and guest, respectively.
|
||||||
|
|
||||||
|
The KVM host may run at EL0 (userspace), EL1 (non-VHE kernel) and EL2 (VHE
|
||||||
|
kernel or non-VHE hypervisor).
|
||||||
|
|
||||||
|
The KVM guest may run at EL0 (userspace) and EL1 (kernel).
|
||||||
|
|
||||||
|
Due to the overlapping exception levels between host and guests we cannot
|
||||||
|
exclusively rely on the PMU's hardware exception filtering - therefore we
|
||||||
|
must enable/disable counting on the entry and exit to the guest. This is
|
||||||
|
performed differently on VHE and non-VHE systems.
|
||||||
|
|
||||||
|
For non-VHE systems we exclude EL2 for exclude_host - upon entering and
|
||||||
|
exiting the guest we disable/enable the event as appropriate based on the
|
||||||
|
exclude_host and exclude_guest attributes.
|
||||||
|
|
||||||
|
For VHE systems we exclude EL1 for exclude_guest and exclude both EL0,EL2
|
||||||
|
for exclude_host. Upon entering and exiting the guest we modify the event
|
||||||
|
to include/exclude EL0 as appropriate based on the exclude_host and
|
||||||
|
exclude_guest attributes.
|
||||||
|
|
||||||
|
The statements above also apply when these attributes are used within a
|
||||||
|
non-VHE guest; however, please note that EL2 is never counted within a guest.
|
||||||
|
|
||||||
|
|
||||||
|
Accuracy
|
||||||
|
--------
|
||||||
|
|
||||||
|
On non-VHE hosts we enable/disable counters on the entry/exit of host/guest
|
||||||
|
transition at EL2 - however there is a period of time between
|
||||||
|
enabling/disabling the counters and entering/exiting the guest. We are
|
||||||
|
able to eliminate counters counting host events on the boundaries of guest
|
||||||
|
entry/exit when counting guest events by filtering out EL2 for
|
||||||
|
exclude_host. However when using !exclude_hv there is a small blackout
|
||||||
|
window at the guest entry/exit where host events are not captured.
|
||||||
|
|
||||||
|
On VHE systems there are no blackout windows.
|
||||||
@@ -87,7 +87,21 @@ used to get and set the keys for a thread.
|
|||||||
Virtualization
|
Virtualization
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
Pointer authentication is not currently supported in KVM guests. KVM
|
Pointer authentication is enabled in a KVM guest when each virtual cpu is
|
||||||
will mask the feature bits from ID_AA64ISAR1_EL1, and attempted use of
|
initialised by passing flags KVM_ARM_VCPU_PTRAUTH_[ADDRESS/GENERIC] and
|
||||||
the feature will result in an UNDEFINED exception being injected into
|
requesting these two separate cpu features to be enabled. The current KVM
|
||||||
the guest.
|
guest implementation works by enabling both features together, so both
|
||||||
|
these userspace flags are checked before enabling pointer authentication.
|
||||||
|
The separate userspace flag will allow to have no userspace ABI changes
|
||||||
|
if support is added in the future to allow these two features to be
|
||||||
|
enabled independently of one another.
|
||||||
|
|
||||||
|
As Arm Architecture specifies that Pointer Authentication feature is
|
||||||
|
implemented along with the VHE feature, KVM arm64 ptrauth code relies
|
||||||
|
on VHE mode to be present.
|
||||||
|
|
||||||
|
Additionally, when these vcpu feature flags are not set then KVM will
|
||||||
|
filter out the Pointer Authentication system key registers from
|
||||||
|
KVM_GET/SET_REG_* ioctls and mask those features from cpufeature ID
|
||||||
|
register. Any attempt to use the Pointer Authentication instructions will
|
||||||
|
result in an UNDEFINED exception being injected into the guest.
|
||||||
|
|||||||
@@ -61,6 +61,7 @@ stable kernels.
|
|||||||
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
| ARM | Cortex-A76 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||||
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
| ARM | Cortex-A76 | #1165522 | ARM64_ERRATUM_1165522 |
|
||||||
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
| ARM | Cortex-A76 | #1286807 | ARM64_ERRATUM_1286807 |
|
||||||
|
| ARM | Neoverse-N1 | #1188873 | ARM64_ERRATUM_1188873 |
|
||||||
| ARM | MMU-500 | #841119,#826419 | N/A |
|
| ARM | MMU-500 | #841119,#826419 | N/A |
|
||||||
| | | | |
|
| | | | |
|
||||||
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
| Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 |
|
||||||
@@ -77,6 +78,7 @@ stable kernels.
|
|||||||
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
| Hisilicon | Hip0{5,6,7} | #161010101 | HISILICON_ERRATUM_161010101 |
|
||||||
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
| Hisilicon | Hip0{6,7} | #161010701 | N/A |
|
||||||
| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 |
|
| Hisilicon | Hip07 | #161600802 | HISILICON_ERRATUM_161600802 |
|
||||||
|
| Hisilicon | Hip08 SMMU PMCG | #162001800 | N/A |
|
||||||
| | | | |
|
| | | | |
|
||||||
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
| Qualcomm Tech. | Kryo/Falkor v1 | E1003 | QCOM_FALKOR_ERRATUM_1003 |
|
||||||
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
| Qualcomm Tech. | Falkor v1 | E1009 | QCOM_FALKOR_ERRATUM_1009 |
|
||||||
|
|||||||
@@ -34,6 +34,23 @@ model features for SVE is included in Appendix A.
|
|||||||
following sections: software that needs to verify that those interfaces are
|
following sections: software that needs to verify that those interfaces are
|
||||||
present must check for HWCAP_SVE instead.
|
present must check for HWCAP_SVE instead.
|
||||||
|
|
||||||
|
* On hardware that supports the SVE2 extensions, HWCAP2_SVE2 will also
|
||||||
|
be reported in the AT_HWCAP2 aux vector entry. In addition to this,
|
||||||
|
optional extensions to SVE2 may be reported by the presence of:
|
||||||
|
|
||||||
|
HWCAP2_SVE2
|
||||||
|
HWCAP2_SVEAES
|
||||||
|
HWCAP2_SVEPMULL
|
||||||
|
HWCAP2_SVEBITPERM
|
||||||
|
HWCAP2_SVESHA3
|
||||||
|
HWCAP2_SVESM4
|
||||||
|
|
||||||
|
This list may be extended over time as the SVE architecture evolves.
|
||||||
|
|
||||||
|
These extensions are also reported via the CPU ID register ID_AA64ZFR0_EL1,
|
||||||
|
which userspace can read using an MRS instruction. See elf_hwcaps.txt and
|
||||||
|
cpu-feature-registers.txt for details.
|
||||||
|
|
||||||
* Debuggers should restrict themselves to interacting with the target via the
|
* Debuggers should restrict themselves to interacting with the target via the
|
||||||
NT_ARM_SVE regset. The recommended way of detecting support for this regset
|
NT_ARM_SVE regset. The recommended way of detecting support for this regset
|
||||||
is to connect to a target process first and then attempt a
|
is to connect to a target process first and then attempt a
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
|
=============
|
||||||
On atomic bitops.
|
Atomic bitops
|
||||||
|
=============
|
||||||
|
|
||||||
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
While our bitmap_{}() functions are non-atomic, we have a number of operations
|
||||||
operating on single bits in a bitmap that are atomic.
|
operating on single bits in a bitmap that are atomic.
|
||||||
|
|||||||
@@ -56,6 +56,23 @@ Barriers:
|
|||||||
smp_mb__{before,after}_atomic()
|
smp_mb__{before,after}_atomic()
|
||||||
|
|
||||||
|
|
||||||
|
TYPES (signed vs unsigned)
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
While atomic_t, atomic_long_t and atomic64_t use int, long and s64
|
||||||
|
respectively (for hysterical raisins), the kernel uses -fno-strict-overflow
|
||||||
|
(which implies -fwrapv) and defines signed overflow to behave like
|
||||||
|
2s-complement.
|
||||||
|
|
||||||
|
Therefore, an explicitly unsigned variant of the atomic ops is strictly
|
||||||
|
unnecessary and we can simply cast; there is no UB.
|
||||||
|
|
||||||
|
There was a bug in UBSAN prior to GCC-8 that would generate UB warnings for
|
||||||
|
signed types.
|
||||||
|
|
||||||
|
With this we also conform to the C/C++ _Atomic behaviour and things like
|
||||||
|
P1236R1.
|
||||||
|
|
||||||
|
|
||||||
SEMANTICS
|
SEMANTICS
|
||||||
---------
|
---------
|
||||||
|
|||||||
@@ -20,13 +20,26 @@ for that device, by setting low_latency to 0. See Section 3 for
|
|||||||
details on how to configure BFQ for the desired tradeoff between
|
details on how to configure BFQ for the desired tradeoff between
|
||||||
latency and throughput, or on how to maximize throughput.
|
latency and throughput, or on how to maximize throughput.
|
||||||
|
|
||||||
BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
|
Like every I/O scheduler, BFQ adds some overhead to per-I/O-request
|
||||||
can process for a device scheduled with BFQ. To give an idea of the
|
processing. To give an idea of this overhead, the total,
|
||||||
limits on slow or average CPUs, here are, first, the limits of BFQ for
|
single-lock-protected, per-request processing time of BFQ---i.e., the
|
||||||
three different CPUs, on, respectively, an average laptop, an old
|
sum of the execution times of the request insertion, dispatch and
|
||||||
desktop, and a cheap embedded system, in case full hierarchical
|
completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
|
||||||
support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
|
(dated CPU for notebooks; time measured with simple code
|
||||||
CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
|
instrumentation, and using the throughput-sync.sh script of the S
|
||||||
|
suite [3], in performance-profiling mode). To put this result into
|
||||||
|
context, the total, single-lock-protected, per-request execution time
|
||||||
|
of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
|
||||||
|
us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
|
||||||
|
|
||||||
|
Scheduling overhead further limits the maximum IOPS that a CPU can
|
||||||
|
process (already limited by the execution of the rest of the I/O
|
||||||
|
stack). To give an idea of the limits with BFQ, on slow or average
|
||||||
|
CPUs, here are, first, the limits of BFQ for three different CPUs, on,
|
||||||
|
respectively, an average laptop, an old desktop, and a cheap embedded
|
||||||
|
system, in case full hierarchical support is enabled (i.e.,
|
||||||
|
CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
|
||||||
|
set (Section 4-2):
|
||||||
- Intel i7-4850HQ: 400 KIOPS
|
- Intel i7-4850HQ: 400 KIOPS
|
||||||
- AMD A8-3850: 250 KIOPS
|
- AMD A8-3850: 250 KIOPS
|
||||||
- ARM CortexTM-A53 Octa-core: 80 KIOPS
|
- ARM CortexTM-A53 Octa-core: 80 KIOPS
|
||||||
@@ -566,3 +579,5 @@ applications. Unset this tunable if you need/want to control weights.
|
|||||||
Slightly extended version:
|
Slightly extended version:
|
||||||
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
|
||||||
results.pdf
|
results.pdf
|
||||||
|
|
||||||
|
[3] https://github.com/Algodev-github/S
|
||||||
|
|||||||
@@ -93,3 +93,7 @@ zoned=[0/1]: Default: 0
|
|||||||
|
|
||||||
zone_size=[MB]: Default: 256
|
zone_size=[MB]: Default: 256
|
||||||
Per zone size when exposed as a zoned block device. Must be a power of two.
|
Per zone size when exposed as a zoned block device. Must be a power of two.
|
||||||
|
|
||||||
|
zone_nr_conv=[nr_conv]: Default: 0
|
||||||
|
The number of conventional zones to create when the block device is zoned. If
|
||||||
|
zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user