#!/usr/bin/env python3
#
# Copyright (c) 2016, 2020 by Delphix. All rights reserved.
#

#
# This script provides read latency data for prefetch I/O.
#
# usage: arc_prefetch.py [-p pool]
#

from bcc import BPF
from time import sleep
from time import strftime
import argparse
import os
import sys

#
# Find BCCHelper. If we are being run from the repo, we should be able to find
# it in the repo's lib/ directory. If we can't find that, look for BCCHelper
# in its install location.
#
base_dir = os.path.dirname(__file__) + "/../../"
if not os.path.exists(base_dir + "lib/bcchelper.py"):
    base_dir = "/usr/share/performance-diagnostics/"
sys.path.append(base_dir + 'lib/')
# flake8 wants these at the top of the file, but path update must come first
from bcchelper import BCCHelper  # noqa: E402
from bcchelper import BCCMapIndex  # noqa: E402
from bcchelper import BCCPerCPUIntArray  # noqa: E402
from bcchelper import BCCPoolCompare  # noqa: E402

#
# Embedded BPF C program. It starts with the repo's helper header, followed
# by the kernel/ZFS headers providing the types dereferenced below.
#
# NOTE(review): the original #include targets were lost in transit (the
# angle-bracketed names were stripped); the list below is a best-effort
# reconstruction -- confirm against the upstream repo before relying on it.
#
bpf_text = '#include "' + base_dir + 'lib/bcc_helper.h' + '"\n'
bpf_text += """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/fs.h>

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_vfsops.h>
#include <sys/dsl_pool.h>
"""
parser = argparse.ArgumentParser(
    description='Collect arc_prefetch statistics.',
    usage='estat arc-prefetch [options]')
parser.add_argument('-p', '--pool', type=str, action='store',
                    dest='pool',
                    help='The pool to monitor (default: domain0)')
args = parser.parse_args()

# Add the pool POOL_COMPARE macro to the bpf_text C code
pool = args.pool if args.pool else "domain0"
pool_compare = BCCPoolCompare(pool)
if not pool_compare.get_pool_pointer():
    print("Warning: No pool filtering, unable to find zfs pool " + pool)
bpf_text += pool_compare.get_pool_compare_code()

bpf_text += """
#define ARC_FLAG_PREFETCH (1 << 2) /* I/O is a prefetch */

/*
 * Fallback definitions of the histogram-key helper macros (these were added
 * to lib/bcc_helper.h alongside this script); keep the script working when
 * an older installed header lacks them.
 */
#ifndef HIST_KEY_INITIALIZE
#define HIST_KEY_INITIALIZE(hist_key_type, hist_key, agg_key, slot) \\
    hist_key_type hist_key = {agg_key, slot};
#define HIST_KEY_GET_AGGKEY(hist_key_ptr) (&(hist_key_ptr)->agg_key)
#define HIST_KEY_GET_SLOT(hist_key_ptr) ((hist_key_ptr)->slot)
#define HIST_KEY_SET_SLOT(hist_key_ptr, nslot) (hist_key_ptr)->slot = nslot;
#define HIST_KEY_SET_AGGKEY(hist_key_ptr, agg_key) \\
    (hist_key_ptr)->agg_key = agg_key;
#endif

/* Per-thread state carried from zfs_read entry through the arc/zio probes. */
typedef struct {
    u64 zfs_read_ts;   /* zfs_read() entry timestamp */
    u64 arc_read_ts;   /* arc_read() entry timestamp */
    zio_t *zio;        /* zio issued for this read, if any */
} arc_prefetch_info_t;

/* Aggregation key for the latency maps. */
typedef struct {
    u64 t;
    u32 index;         /* ArcLatencyIndex value */
    u64 cpuid;
} lat_key;

HIST_KEY(hist_lat_key, lat_key);

BPF_HASH(arc_prefetch_info, u32, arc_prefetch_info_t);
BPF_HASH(zio_read_exit_time, zio_t *, u64);

BPF_HASH(read_latency, hist_lat_key, u64);
BPF_HASH(read_average, lat_key, average_t);
BPF_PERCPU_ARRAY(arc_count, u32, NCOUNT_INDEX);

/*
 * Record threads entering zfs_read() against the pool of interest; the
 * probes below no-op for threads absent from arc_prefetch_info.
 */
int zfs_read_entry(struct pt_regs *ctx, struct inode *ip)
{
    u32 tid = bpf_get_current_pid_tgid();
    u64 ts = bpf_ktime_get_ns();
    arc_prefetch_info_t info = {ts};

    // filter by pool
    zfsvfs_t *zfsvfs = ip->i_sb->s_fs_info;
    objset_t *z_os = zfsvfs->z_os;
    spa_t *spa = z_os->os_spa;
    if (POOL_COMPARE(spa))
        arc_prefetch_info.update(&tid, &info);

    return 0;
}

int zfs_read_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);

    /* drop the thread from the tracking map on the way out */
    if (info != NULL)
        arc_prefetch_info.delete(&tid);
    return 0;
}


int arc_hit_entry(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->zfs_read_ts == 0)
        return 0;

    arc_count.increment(ARC_HIT_COUNT);
    return 0;
}

int arc_miss_entry(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->zfs_read_ts == 0)
        return 0;

    arc_count.increment(ARC_MISS_COUNT);
    return 0;
}

int arc_read_entry(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->zfs_read_ts == 0)
        return 0;

    info->arc_read_ts = bpf_ktime_get_ns();
    return 0;
}

int arc_read_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->arc_read_ts == 0)
        return 0;

    /* microseconds spent issuing the arc read */
    u64 elapsed = (bpf_ktime_get_ns() - info->arc_read_ts) / 1000;
    lat_key lkey = {1, ARC_ISSUE_LATENCY, 0};
    lkey.cpuid = bpf_get_smp_processor_id();
    u32 slot = bpf_log2l(elapsed);
    HIST_KEY_INITIALIZE(hist_lat_key, hkey, lkey, slot);
    read_latency.increment(hkey);

    average_t *average = read_average.lookup(&lkey);
    if (average == NULL) {
        average_t initial_average = {1, elapsed};
        read_average.update(&lkey, &initial_average);
        return 0;
    }
    average->count += 1;
    average->sum += elapsed;
    return 0;
}

int zio_read_entry(struct pt_regs *ctx, zio_t *zio)
{
    u32 tid = bpf_get_current_pid_tgid();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->arc_read_ts == 0)
        return 0;

    info->zio = zio;
    return 0;
}

int zio_read_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    u64 zio_exit_ts = bpf_ktime_get_ns();
    arc_prefetch_info_t *info = arc_prefetch_info.lookup(&tid);
    if (info == NULL || info->zio == NULL)
        return 0;

    zio_read_exit_time.update(&info->zio, &zio_exit_ts);
    return 0;
}

/*
 * arc_read_done() fires when the zio completes; attribute the elapsed time
 * since zio_read() returned to either the prefetch or the normal bucket.
 */
int arc_read_done_entry(struct pt_regs *ctx, zio_t *zio)
{
    arc_buf_hdr_t *hdr = (arc_buf_hdr_t *) zio->io_private;
    u64 zero = 0;

    u64 *zio_exit_ts = zio_read_exit_time.lookup(&zio);
    if (zio_exit_ts == NULL || *zio_exit_ts == 0) {
        return 0;
    }

    u64 zio_ts = *zio_exit_ts;
    u64 elapsed = (bpf_ktime_get_ns() - zio_ts) / 1000;

    hist_lat_key hkey = { };
    lat_key *lkey = HIST_KEY_GET_AGGKEY(&hkey);
    u32 count_index;
    if (hdr->b_flags & ARC_FLAG_PREFETCH) {
        count_index = ARC_PREFETCH_ZIO_COUNT;
        lkey->index = ARC_PREFETCH_ZIO_LATENCY;
    } else { // normal zio
        count_index = ARC_NORMAL_ZIO_COUNT;
        lkey->index = ARC_NORMAL_ZIO_LATENCY;
    }
    lkey->t = 1;
    lkey->cpuid = bpf_get_smp_processor_id();

    /* elapsed is a u64: use bpf_log2l, matching arc_read_return */
    HIST_KEY_SET_SLOT(&hkey, bpf_log2l(elapsed));

    arc_count.increment(count_index);
    read_latency.increment(hkey);
    zio_read_exit_time.update(&zio, &zero);
    average_t *average = read_average.lookup(lkey);
    if (average == NULL) {
        average_t initial_average = {1, elapsed};
        read_average.update(lkey, &initial_average);
        return 0;
    }
    average->count += 1;
    average->sum += elapsed;

    return 0;
}
"""


class ArcCountIndex(BCCMapIndex):
    """Indices of the per-cpu counters maintained by the BPF program."""
    ARC_HIT_COUNT = (0, 'ARC_HIT_COUNT', 'prefetch hit count')
    ARC_MISS_COUNT = (1, 'ARC_MISS_COUNT', 'prefetch miss count')
    ARC_NORMAL_ZIO_COUNT = (2, 'ARC_NORMAL_ZIO_COUNT', 'normal read count')
    ARC_PREFETCH_ZIO_COUNT = (3, 'ARC_PREFETCH_ZIO_COUNT',
                              'prefetch read count')


class ArcLatencyIndex(BCCMapIndex):
    """Indices of the latency aggregations maintained by the BPF program."""
    ARC_ISSUE_LATENCY = (0, 'ARC_ISSUE_LATENCY', 'arc read latency')
    ARC_NORMAL_ZIO_AVERAGE = (1, 'ARC_NORMAL_ZIO_LATENCY',
                              'normal read latency')
    ARC_PREFETCH_ZIO_AVERAGE = (2, 'ARC_PREFETCH_ZIO_LATENCY',
                                'prefetch read latency')


KVER = os.popen('uname -r').read().rstrip()

flags = ["-include",
         "/usr/src/zfs-" + KVER + "/zfs_config.h",
         "-include",
         "/usr/src/zfs-" + KVER + "/include/spl/sys/types.h",
         "-I/usr/src/zfs-" + KVER + "/include/",
         "-I/usr/src/zfs-" + KVER + "/include/spl/",
         "-I/usr/src/zfs-" + KVER + "/include/linux",
         "-DCC_USING_FENTRY",
         "-DNCOUNT_INDEX=" + str(len(ArcCountIndex)),
         "-DNAVERAGE_INDEX=" + str(len(ArcLatencyIndex))] \
    + ArcCountIndex.getCDefinitions() \
    + ArcLatencyIndex.getCDefinitions()

b = BPF(text=bpf_text, cflags=flags)

b.attach_kprobe(event="zfs_read", fn_name="zfs_read_entry")
b.attach_kretprobe(event="zfs_read", fn_name="zfs_read_return")
# arc__hit/arc__miss are probed via their tracepoint stub functions
b.attach_kprobe(event="trace_zfs_arc__hit", fn_name="arc_hit_entry")
b.attach_kprobe(event="trace_zfs_arc__miss", fn_name="arc_miss_entry")
b.attach_kprobe(event="arc_read", fn_name="arc_read_entry")
b.attach_kretprobe(event="arc_read", fn_name="arc_read_return")
b.attach_kprobe(event="zio_read", fn_name="zio_read_entry")
b.attach_kretprobe(event="zio_read", fn_name="zio_read_return")
b.attach_kprobe(event="arc_read_done", fn_name="arc_read_done_entry")

read_latency_helper = BCCHelper(b, BCCHelper.ESTAT_PRINT_MODE)
read_latency_helper.add_aggregation("read_latency",
                                    BCCHelper.LOG_HISTOGRAM_AGGREGATION,
                                    "microseconds")
read_latency_helper.add_aggregation("read_average",
                                    BCCHelper.AVERAGE_AGGREGATION,
                                    "avg latency(us)")
read_latency_helper.add_key_type("index", BCCHelper.MAP_INDEX_TYPE,
                                 ArcLatencyIndex)
call_count_helper = BCCPerCPUIntArray(b, "arc_count", ArcCountIndex)


# Print the accumulated statistics every 5 seconds; a final snapshot is
# emitted when the user interrupts with Ctrl-C.
while True:
    try:
        sleep(5)
    except KeyboardInterrupt:
        print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
        call_count_helper.printall()
        read_latency_helper.printall()
        break
    try:
        print("%-16s\n" % strftime("%D - %H:%M:%S %Z"))
        call_count_helper.printall()
        read_latency_helper.printall()
    except Exception as e:
        print(str(e))
        break
class BCCMapIndex(Enum):
    """Provides index constants for bcc map keys.

    Each member's value is an (index, csymbol, display) tuple: the integer
    map index, the C preprocessor symbol passed to bcc via -D, and the
    human-readable display name.

    Note: earlier revisions also defined index()/display() methods, but the
    same-named instance attributes assigned in __init__ shadowed them on
    every member, making the methods unreachable dead code; they have been
    removed. Use the .index and .display attributes directly.
    """

    def __init__(self, index, csymbol, display):
        self.index = index
        self.csymbol = csymbol
        self.display = display

    def definition(self):
        """The CPP definition (-DSYMBOL=index) for this index."""
        return "-D" + self.csymbol + "=" + str(self.index)

    @classmethod
    def list(cls):
        """List the (index, csymbol, display) tuples of all members."""
        return [member.value for member in cls]

    @classmethod
    def displayName(cls, i):
        """The display name for the member whose index is i."""
        # Look up by the declared index rather than by member position, so
        # the result stays correct even if members are declared out of order.
        for member in cls:
            if member.index == i:
                return member.display
        raise IndexError("no index %d in %s" % (i, cls.__name__))

    @classmethod
    def getCDefinitions(cls):
        """Definitions for bcc C code (one -DSYMBOL=index per member)."""
        return [member.definition() for member in cls]


class BCCPerCPUIntArray:
    """Provide output support for a PerCPU Array indexed by a BCCMapIndex."""

    def __init__(self, b, name, mapindex):
        self.table = b[name]
        self.name = name
        self.mapindex = mapindex

    def printall(self, clear=True):
        """Print and (by default) clear all counters in the array."""
        items = self.table.items()
        if clear:
            self.table.clear()

        for key, percpu_values in items:
            # Aggregate the per-cpu counts into a single total.
            total = sum(percpu_values)
            if total > 0:
                print('{:20}'.format(self.mapindex.displayName(key.value)) +
                      ' : ' + str(total))


class BCCPoolCompare:
    """Provide code to test for a zfs pool instance."""

    def __init__(self, pool_name):
        """Leverage sdb to get the spa_t pointer for pool_name.

        If sdb is unavailable or the pool is not found, pool_pointer is
        left as None and the generated macro matches every pool.
        """
        self.pool_name = pool_name
        # NOTE(review): pool_name is interpolated into a shell command line;
        # do not pass untrusted names here.
        output = subprocess.getoutput(
            "sudo sdb -e \'spa " + self.pool_name + "\'")
        pool_match = re.search(r'0x\w+', output)
        self.pool_pointer = pool_match.group(0) if pool_match else None

    def get_pool_pointer(self):
        """Return the kernel address for the spa_t object, or None."""
        return self.pool_pointer

    def get_pool_compare_code(self):
        """Generate the C POOL_COMPARE macro."""
        if not self.pool_pointer:
            return "#define POOL_COMPARE(spa) 1"
        return ("#define POOL_COMPARE(spa) ((ulong)spa == " +
                self.pool_pointer + ")")