linux/tools/workqueue/wq_monitor.py
Tejun Heo 8a1dd1e547 workqueue: Track and monitor per-workqueue CPU time usage
Now that wq_worker_tick() is there, we can easily track the rough CPU time
consumption of each workqueue by charging the whole tick whenever a tick
hits an active workqueue. While not super accurate, it provides reasonable
visibility into the workqueues that consume a lot of CPU cycles.
wq_monitor.py is updated to report the per-workqueue CPU times.

v2: wq_monitor.py was using "cputime" as the key when outputting in json
    format. Use "cpu_time" instead for consistency with other fields.

Signed-off-by: Tejun Heo <tj@kernel.org>
2023-05-17 17:02:09 -10:00

169 lines
5.9 KiB
Python

#!/usr/bin/env drgn
#
# Copyright (C) 2023 Tejun Heo <tj@kernel.org>
# Copyright (C) 2023 Meta Platforms, Inc. and affiliates.
desc = """
This is a drgn script to monitor workqueues. For more info on drgn, visit
https://github.com/osandov/drgn.
total Total number of work items executed by the workqueue.
infl The number of currently in-flight work items.
CPUtime Total CPU time consumed by the workqueue in seconds. This is
sampled from scheduler ticks and only provides ballpark
measurement. "nohz_full=" CPUs are excluded from measurement.
CPUitsv The number of times a concurrency-managed work item hogged CPU
longer than the threshold (workqueue.cpu_intensive_thresh_us)
and got excluded from concurrency management to avoid stalling
other work items.
CMwake The number of concurrency-management wake-ups while executing a
work item of the workqueue.
mayday The number of times the rescuer was requested while waiting for
new worker creation.
rescued The number of work items executed by the rescuer.
"""
import sys
import signal
import os
import re
import time
import json
import drgn
from drgn.helpers.linux.list import list_for_each_entry,list_empty
from drgn.helpers.linux.cpumask import for_each_possible_cpu
import argparse
parser = argparse.ArgumentParser(description=desc,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('workqueue', metavar='REGEX', nargs='*',
help='Target workqueue name patterns (all if empty)')
parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1,
help='Monitoring interval (0 to print once and exit)')
parser.add_argument('-j', '--json', action='store_true',
help='Output in json')
args = parser.parse_args()
def err(s):
print(s, file=sys.stderr, flush=True)
sys.exit(1)
workqueues = prog['workqueues']
WQ_UNBOUND = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
PWQ_NR_STATS = prog['PWQ_NR_STATS']
class WqStats:
def __init__(self, wq):
self.name = wq.name.string_().decode()
self.unbound = wq.flags & WQ_UNBOUND != 0
self.mem_reclaim = wq.flags & WQ_MEM_RECLAIM != 0
self.stats = [0] * PWQ_NR_STATS
for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
for i in range(PWQ_NR_STATS):
self.stats[i] += int(pwq.stats[i])
def dict(self, now):
return { 'timestamp' : now,
'name' : self.name,
'unbound' : self.unbound,
'mem_reclaim' : self.mem_reclaim,
'started' : self.stats[PWQ_STAT_STARTED],
'completed' : self.stats[PWQ_STAT_COMPLETED],
'cpu_time' : self.stats[PWQ_STAT_CPU_TIME],
'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
'mayday' : self.stats[PWQ_STAT_MAYDAY],
'rescued' : self.stats[PWQ_STAT_RESCUED], }
def table_header_str():
return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
def table_row_str(self):
cpu_intensive = '-'
cm_wakeup = '-'
mayday = '-'
rescued = '-'
if not self.unbound:
cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
if self.mem_reclaim:
mayday = str(self.stats[PWQ_STAT_MAYDAY])
rescued = str(self.stats[PWQ_STAT_RESCUED])
out = f'{self.name[-24:]:24} ' \
f'{self.stats[PWQ_STAT_STARTED]:8} ' \
f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
f'{cpu_intensive:>7} ' \
f'{cm_wakeup:>7} ' \
f'{mayday:>7} ' \
f'{rescued:>7} '
return out.rstrip(':')
exit_req = False
def sigint_handler(signr, frame):
global exit_req
exit_req = True
def main():
# handle args
table_fmt = not args.json
interval = args.interval
re_str = None
if args.workqueue:
for r in args.workqueue:
if re_str is None:
re_str = r
else:
re_str += '|' + r
filter_re = re.compile(re_str) if re_str else None
# monitoring loop
signal.signal(signal.SIGINT, sigint_handler)
while not exit_req:
now = time.time()
if table_fmt:
print()
print(WqStats.table_header_str())
for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
stats = WqStats(wq)
if filter_re and not filter_re.search(stats.name):
continue
if table_fmt:
print(stats.table_row_str())
else:
print(stats.dict(now))
if interval == 0:
break
time.sleep(interval)
if __name__ == "__main__":
main()