diff --git a/.github/actions/veristat_baseline_compare/action.yml b/.github/actions/veristat_baseline_compare/action.yml new file mode 100644 index 0000000000000..f6dd81d19e4b3 --- /dev/null +++ b/.github/actions/veristat_baseline_compare/action.yml @@ -0,0 +1,49 @@ +name: 'run-veristat' +description: 'Run veristat benchmark' +inputs: + veristat_output: + description: 'Veristat output filepath' + required: true + baseline_name: + description: 'Veristat baseline cache name' + required: true +runs: + using: "composite" + steps: + - uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.baseline_name }} + if-no-files-found: error + path: ${{ github.workspace }}/${{ inputs.veristat_output }} + + # For pull request: + # - get baseline log from cache + # - compare it to current run + - if: ${{ github.event_name == 'pull_request' }} + uses: actions/cache/restore@v4 + with: + key: ${{ github.base_ref }}-${{ inputs.baseline_name }}- + restore-keys: | + ${{ github.base_ref }}-${{ inputs.baseline_name }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' + + - if: ${{ github.event_name == 'pull_request' }} + name: Show veristat comparison + shell: bash + run: ./.github/scripts/compare-veristat-results.sh + env: + BASELINE_PATH: ${{ github.workspace }}/${{ inputs.baseline_name }} + VERISTAT_OUTPUT: ${{ inputs.veristat_output }} + + # For push: just put baseline log to cache + - if: ${{ github.event_name == 'push' }} + shell: bash + run: | + mv "${{ github.workspace }}/${{ inputs.veristat_output }}" \ + "${{ github.workspace }}/${{ inputs.baseline_name }}" + + - if: ${{ github.event_name == 'push' }} + uses: actions/cache/save@v4 + with: + key: ${{ github.ref_name }}-${{ inputs.baseline_name }}-${{ github.run_id }} + path: '${{ github.workspace }}/${{ inputs.baseline_name }}' diff --git a/.github/scripts/collect-scx-bpf-progs.sh b/.github/scripts/collect-scx-bpf-progs.sh new file mode 100755 index 0000000000000..e2bb051ed0e9e --- /dev/null +++ b/.github/scripts/collect-scx-bpf-progs.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +PROGS_DIR=$1 + +mkdir -p "${PROGS_DIR}" + +find "${SCX_BUILD_OUTPUT}" -type f -name "bpf.bpf.o" -print0 | \ +while IFS= read -r -d '' prog; do + obj_name=$(echo "$prog" | grep -o "scx.*.bpf.o" | tr / _) + cp -v "$prog" "${PROGS_DIR}/${obj_name}" +done diff --git a/.github/scripts/compare-veristat-results.sh b/.github/scripts/compare-veristat-results.sh new file mode 100755 index 0000000000000..5bc761a9f8792 --- /dev/null +++ b/.github/scripts/compare-veristat-results.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +if [[ ! -f "${BASELINE_PATH}" ]]; then + echo "# No ${BASELINE_PATH} available" >> "${GITHUB_STEP_SUMMARY}" + + echo "No ${BASELINE_PATH} available" + echo "Printing veristat results" + cat "${VERISTAT_OUTPUT}" + + exit 0 +fi + +veristat=$(realpath selftests/bpf/veristat) +cmp_out=$(mktemp veristate_compare_out_XXXXXX.csv) + +$veristat \ + --output-format csv \ + --emit file,prog,verdict,states \ + --compare "${BASELINE_PATH}" "${VERISTAT_OUTPUT}" > $cmp_out + +python3 ./.github/scripts/veristat_compare.py $cmp_out +exit_code=$? 
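+# Note: the script deliberately avoids `set -e` and captures the exit code
+# here, so a failing comparison still reaches the optional verifier-log dump
+# below before the script exits with that status.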
+ +echo +# if comparison failed, print verifier log for failure mismatches +if [[ -n "$VERISTAT_DUMP_LOG_ON_FAILURE" && $exit_code -ne 0 ]]; then + cat $cmp_out | tail -n +1 | \ + while read -r line; do + verdict=$(echo $line | cut -d',' -f4) + verdict_diff=$(echo $line | cut -d',' -f5) + if [[ "$verdict" == "failure" && "$verdict_diff" == "MISMATCH" ]]; then + file=$(echo $line | cut -d',' -f1) + prog=$(echo $line | cut -d',' -f2) + echo "VERIFIER LOG FOR $file/$prog:" + echo "==================================================================" + $veristat -v $VERISTAT_OBJECTS_DIR/$file -f $prog 2>&1 + echo "==================================================================" + fi + done +fi + +exit $exit_code diff --git a/.github/scripts/download-gcc-bpf.sh b/.github/scripts/download-gcc-bpf.sh new file mode 100755 index 0000000000000..894584a01b2ec --- /dev/null +++ b/.github/scripts/download-gcc-bpf.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +GCC_BPF_RELEASE_GH_REPO=$1 +INSTALL_DIR=$(realpath $2) + +cd /tmp + +tag=$(gh release list -L 1 -R ${GCC_BPF_RELEASE_GH_REPO} --json tagName -q .[].tagName) +if [[ -z "$tag" ]]; then + echo "Could not find latest GCC BPF release at ${GCC_BPF_RELEASE_GH_REPO}" + exit 1 +fi + +url="https://github.com/${GCC_BPF_RELEASE_GH_REPO}/releases/download/${tag}/${tag}.tar.zst" +echo "Downloading $url" +wget -q "$url" + +tarball=${tag}.tar.zst +dir=$(tar tf $tarball | head -1 || true) + +echo "Extracting $tarball ..." +tar -I zstd -xf $tarball && rm -f $tarball + +rm -rf $INSTALL_DIR +mv -v $dir $INSTALL_DIR + +cd - + diff --git a/.github/scripts/matrix.py b/.github/scripts/matrix.py new file mode 100644 index 0000000000000..220146e55d7db --- /dev/null +++ b/.github/scripts/matrix.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 + +import dataclasses +import json +import os + +from enum import Enum +from typing import Any, Dict, Final, List, Optional, Set, Union + +import requests + +MANAGED_OWNER: Final[str] = "kernel-patches" +MANAGED_REPOS: Final[Set[str]] = { + f"{MANAGED_OWNER}/bpf", + f"{MANAGED_OWNER}/vmtest", +} + +DEFAULT_SELF_HOSTED_RUNNER_TAGS: Final[List[str]] = ["self-hosted", "docker-noble-main"] +DEFAULT_GITHUB_HOSTED_RUNNER: Final[str] = "ubuntu-24.04" +DEFAULT_GCC_VERSION: Final[int] = 14 +DEFAULT_LLVM_VERSION: Final[int] = 20 + +RUNNERS_BUSY_THRESHOLD: Final[float] = 0.8 + + +class Arch(str, Enum): + """ + CPU architecture supported by CI. 
+ """ + + AARCH64 = "aarch64" + S390X = "s390x" + X86_64 = "x86_64" + + +class Compiler(str, Enum): + GCC = "gcc" + LLVM = "llvm" + + +def query_runners_from_github() -> List[Dict[str, Any]]: + if "GITHUB_TOKEN" not in os.environ: + return [] + token = os.environ["GITHUB_TOKEN"] + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + owner = os.environ["GITHUB_REPOSITORY_OWNER"] + url: Optional[str] = f"https://api.github.com/orgs/{owner}/actions/runners" + # GitHub returns 30 runners per page, fetch all + all_runners = [] + try: + while url is not None: + response = requests.get(url, headers=headers) + if response.status_code != 200: + print(f"Failed to query runners: {response.status_code}") + print(f"response: {response.text}") + return [] + data = response.json() + all_runners.extend(data.get("runners", [])) + # Check for next page URL in Link header + url = None + if "Link" in response.headers: + links = requests.utils.parse_header_links(response.headers["Link"]) + for link in links: + if link["rel"] == "next": + url = link["url"] + break + return all_runners + except Exception as e: + print(f"Warning: Failed to query runner status due to exception: {e}") + return [] + + +all_runners_cached: Optional[List[Dict[str, Any]]] = None + + +def all_runners() -> List[Dict[str, Any]]: + global all_runners_cached + if all_runners_cached is None: + print("Querying runners from GitHub...") + all_runners_cached = query_runners_from_github() + print(f"Github returned {len(all_runners_cached)} runners") + counts = count_by_status(all_runners_cached) + print( + f"Busy: {counts['busy']}, Idle: {counts['idle']}, Offline: {counts['offline']}" + ) + return all_runners_cached + + +def runner_labels(runner: Dict[str, Any]) -> List[str]: + return [label["name"] for label in runner["labels"]] + + +def is_self_hosted_runner(runner: Dict[str, Any]) -> bool: + labels = runner_labels(runner) + for label in DEFAULT_SELF_HOSTED_RUNNER_TAGS: + if label not in labels: + return False + return True + + +def self_hosted_runners() -> List[Dict[str, Any]]: + runners = all_runners() + return [r for r in runners if is_self_hosted_runner(r)] + + +def runners_by_arch(arch: Arch) -> List[Dict[str, Any]]: + runners = self_hosted_runners() + return [r for r in runners if arch.value in runner_labels(r)] + + +def count_by_status(runners: List[Dict[str, Any]]) -> Dict[str, int]: + result = {"busy": 0, "idle": 0, "offline": 0} + for runner in runners: + if runner["status"] == "online": + if runner["busy"]: + result["busy"] += 1 + else: + result["idle"] += 1 + else: + result["offline"] += 1 + return result + + +@dataclasses.dataclass +class BuildConfig: + arch: Arch + kernel_compiler: Compiler = Compiler.GCC + gcc_version: int = DEFAULT_GCC_VERSION + llvm_version: int = DEFAULT_LLVM_VERSION + kernel: str = "LATEST" + run_veristat: bool = False + parallel_tests: bool = False + build_release: bool = False + + @property + def runs_on(self) -> List[str]: + if is_managed_repo(): + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [self.arch.value] + else: + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + @property + def build_runs_on(self) -> List[str]: + if not is_managed_repo(): + return [DEFAULT_GITHUB_HOSTED_RUNNER] + + # @Temporary: disable codebuild runners for cross-compilation jobs + match self.arch: + case Arch.S390X: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.X86_64.value] + case Arch.AARCH64: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [Arch.AARCH64.value] + + # For managed repos, check the 
busyness of relevant self-hosted runners + # If they are too busy, use codebuild + runner_arch = self.arch + # We don't build s390x kernel on s390x runners, because it's too slow + # Cross-compiling on x86_64 is faster + if runner_arch == Arch.S390X: + runner_arch = Arch.X86_64 + runners = runners_by_arch(runner_arch) + counts = count_by_status(runners) + online = counts["idle"] + counts["busy"] + busy = counts["busy"] + # if online <= 0, then something is wrong, don't use codebuild + if online > 0 and busy / online > RUNNERS_BUSY_THRESHOLD: + return ["codebuild"] + else: + return DEFAULT_SELF_HOSTED_RUNNER_TAGS + [runner_arch.value] + + @property + def tests(self) -> Dict[str, Any]: + tests_list = [ + "test_progs", + "test_progs_parallel", + "test_progs_no_alu32", + "test_progs_no_alu32_parallel", + "test_verifier", + ] + + if self.arch.value != "s390x": + tests_list.append("test_maps") + + if self.llvm_version >= 18: + tests_list.append("test_progs_cpuv4") + + if self.arch in [Arch.X86_64, Arch.AARCH64]: + tests_list.append("sched_ext") + + # Don't run GCC BPF runner, because too many tests are failing + # See: https://lore.kernel.org/bpf/87bjw6qpje.fsf@oracle.com/ + # if self.arch == Arch.X86_64: + # tests_list.append("test_progs-bpf_gcc") + + if not self.parallel_tests: + tests_list = [test for test in tests_list if not test.endswith("parallel")] + + return {"include": [generate_test_config(test) for test in tests_list]} + + def to_dict(self) -> Dict[str, Any]: + return { + "arch": self.arch.value, + "kernel_compiler": self.kernel_compiler.value, + "gcc_version": DEFAULT_GCC_VERSION, + "llvm_version": DEFAULT_LLVM_VERSION, + "kernel": self.kernel, + "run_veristat": self.run_veristat, + "parallel_tests": self.parallel_tests, + "build_release": self.build_release, + "runs_on": self.runs_on, + "tests": self.tests, + "build_runs_on": self.build_runs_on, + } + + +def is_managed_repo() -> bool: + return ( + os.environ["GITHUB_REPOSITORY_OWNER"] == MANAGED_OWNER + and os.environ["GITHUB_REPOSITORY"] in MANAGED_REPOS + ) + + +def set_output(name, value): + """Write an output variable to the GitHub output file.""" + with open(os.getenv("GITHUB_OUTPUT"), "a", encoding="utf-8") as file: + file.write(f"{name}={value}\n") + + +def generate_test_config(test: str) -> Dict[str, Union[str, int]]: + """Create the configuration for the provided test.""" + is_parallel = test.endswith("_parallel") + config = { + "test": test, + "continue_on_error": is_parallel, + # While in experimental mode, parallel jobs may get stuck + # anywhere, including in user space where the kernel won't detect + # a problem and panic. We add a second layer of (smaller) timeouts + # here such that if we get stuck in a parallel run, we hit this + # timeout and fail without affecting the overall job success (as + # would be the case if we hit the job-wide timeout). For + # non-experimental jobs, 360 is the default which will be + # superseded by the overall workflow timeout (but we need to + # specify something). 
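+        # Illustrative example: generate_test_config("test_progs_parallel")
+        # returns {"test": "test_progs_parallel", "continue_on_error": True,
+        # "timeout_minutes": 30}.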
+ "timeout_minutes": 30 if is_parallel else 360, + } + return config + + +if __name__ == "__main__": + matrix = [ + BuildConfig( + arch=Arch.X86_64, + run_veristat=True, + parallel_tests=True, + ), + BuildConfig( + arch=Arch.X86_64, + kernel_compiler=Compiler.LLVM, + build_release=True, + ), + BuildConfig( + arch=Arch.AARCH64, + ), + BuildConfig( + arch=Arch.S390X, + ), + ] + + # Outside of managed repositories only run on x86_64 + if not is_managed_repo(): + matrix = [config for config in matrix if config.arch == Arch.X86_64] + + json_matrix = json.dumps({"include": [config.to_dict() for config in matrix]}) + print(json.dumps(json.loads(json_matrix), indent=4)) + set_output("build_matrix", json_matrix) diff --git a/.github/scripts/tests/test_veristat_compare.py b/.github/scripts/tests/test_veristat_compare.py new file mode 100644 index 0000000000000..b65b69295235d --- /dev/null +++ b/.github/scripts/tests/test_veristat_compare.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import unittest +from typing import Iterable, List + +from ..veristat_compare import parse_table, VeristatFields + + +def gen_csv_table(records: Iterable[str]) -> List[str]: + return [ + ",".join(VeristatFields.headers()), + *records, + ] + + +class TestVeristatCompare(unittest.TestCase): + def test_parse_table_ignore_new_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,N/A,success,N/A,N/A,1,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_ignore_removed_prog(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,N/A,N/A,1,N/A,N/A", + ] + ) + veristat_info = parse_table(table) + self.assertEqual(veristat_info.table, []) + self.assertFalse(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + def test_parse_table_new_failure(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,success,failure,MISMATCH,1,1,+0 (+0.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [["prog_file.bpf.o", "prog_name", "success -> failure (!!)", "+0.00 %"]], + ) + self.assertTrue(veristat_info.changes) + self.assertTrue(veristat_info.new_failures) + + def test_parse_table_new_changes(self): + table = gen_csv_table( + [ + "prog_file.bpf.o,prog_name,failure,success,MISMATCH,0,0,+0 (+0.00%)", + "prog_file.bpf.o,prog_name_increase,failure,failure,MATCH,1,2,+1 (+100.00%)", + "prog_file.bpf.o,prog_name_decrease,success,success,MATCH,1,1,-1 (-100.00%)", + ] + ) + veristat_info = parse_table(table) + self.assertEqual( + veristat_info.table, + [ + ["prog_file.bpf.o", "prog_name", "failure -> success", "+0.00 %"], + ["prog_file.bpf.o", "prog_name_increase", "failure", "+100.00 %"], + ["prog_file.bpf.o", "prog_name_decrease", "success", "-100.00 %"], + ], + ) + self.assertTrue(veristat_info.changes) + self.assertFalse(veristat_info.new_failures) + + +if __name__ == "__main__": + unittest.main() diff --git a/.github/scripts/tmpfsify-workspace.sh b/.github/scripts/tmpfsify-workspace.sh new file mode 100755 index 0000000000000..6fd62b4ad2a49 --- /dev/null +++ b/.github/scripts/tmpfsify-workspace.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -x -euo pipefail + +TMPFS_SIZE=20 # GB +MEM_TOTAL=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo) + +# sanity check: total mem is at least double TMPFS_SIZE +if [ $MEM_TOTAL -lt $(($TMPFS_SIZE*1024*2)) ]; then + echo "tmpfsify-workspace.sh: will not 
allocate tmpfs, total memory is too low (${MEM_TOTAL}MB)" + exit 0 +fi + +dir="$(basename "$GITHUB_WORKSPACE")" +cd "$(dirname "$GITHUB_WORKSPACE")" +mv "${dir}" "${dir}.backup" +mkdir "${dir}" +sudo mount -t tmpfs -o size=${TMPFS_SIZE}G tmpfs "${dir}" +rsync -a "${dir}.backup/" "${dir}" +cd - + diff --git a/.github/scripts/veristat_compare.py b/.github/scripts/veristat_compare.py new file mode 100644 index 0000000000000..07271b8cbd3aa --- /dev/null +++ b/.github/scripts/veristat_compare.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +# This script reads a CSV file produced by the following invocation: +# +# veristat --emit file,prog,verdict,states \ +# --output-format csv \ +# --compare ... +# +# And produces a markdown summary for the file. +# The summary is printed to standard output and appended to a file +# pointed to by GITHUB_STEP_SUMMARY variable. +# +# Script exits with return code 1 if there are new failures in the +# veristat results. +# +# For testing purposes invoke as follows: +# +# GITHUB_STEP_SUMMARY=/dev/null python3 veristat-compare.py test.csv +# +# File format (columns): +# 0. file_name +# 1. prog_name +# 2. verdict_base +# 3. verdict_comp +# 4. verdict_diff +# 5. total_states_base +# 6. total_states_comp +# 7. total_states_diff +# +# Records sample: +# file-a,a,success,failure,MISMATCH,12,12,+0 (+0.00%) +# file-b,b,success,success,MATCH,67,67,+0 (+0.00%) +# +# For better readability suffixes '_OLD' and '_NEW' +# are used instead of '_base' and '_comp' for variable +# names etc. + +import io +import os +import sys +import re +import csv +import logging +import argparse +import enum +from dataclasses import dataclass +from typing import Dict, Iterable, List, Final + + +TRESHOLD_PCT: Final[int] = 0 + +SUMMARY_HEADERS = ["File", "Program", "Verdict", "States Diff (%)"] + +# expected format: +0 (+0.00%) / -0 (-0.00%) +TOTAL_STATES_DIFF_REGEX = ( + r"(?P[+-]\d+) \((?P[+-]\d+\.\d+)\%\)" +) + + +TEXT_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +{table} +""".strip() +) + +HTML_SUMMARY_TEMPLATE: Final[str] = ( + """ +# {title} + +
+<details>
+<summary>Click to expand</summary>
+
+{table}
+</details>
+""".strip() +) + +GITHUB_MARKUP_REPLACEMENTS: Final[Dict[str, str]] = { + "->": "→", + "(!!)": ":bangbang:", +} + +NEW_FAILURE_SUFFIX: Final[str] = "(!!)" + + +class VeristatFields(str, enum.Enum): + FILE_NAME = "file_name" + PROG_NAME = "prog_name" + VERDICT_OLD = "verdict_base" + VERDICT_NEW = "verdict_comp" + VERDICT_DIFF = "verdict_diff" + TOTAL_STATES_OLD = "total_states_base" + TOTAL_STATES_NEW = "total_states_comp" + TOTAL_STATES_DIFF = "total_states_diff" + + @classmethod + def headers(cls) -> List[str]: + return [ + cls.FILE_NAME, + cls.PROG_NAME, + cls.VERDICT_OLD, + cls.VERDICT_NEW, + cls.VERDICT_DIFF, + cls.TOTAL_STATES_OLD, + cls.TOTAL_STATES_NEW, + cls.TOTAL_STATES_DIFF, + ] + + +@dataclass +class VeristatInfo: + table: list + changes: bool + new_failures: bool + + def get_results_title(self) -> str: + if self.new_failures: + return "There are new veristat failures" + + if self.changes: + return "There are changes in verification performance" + + return "No changes in verification performance" + + def get_results_summary(self, markup: bool = False) -> str: + title = self.get_results_title() + if not self.table: + return f"# {title}\n" + + template = TEXT_SUMMARY_TEMPLATE + table = format_table(headers=SUMMARY_HEADERS, rows=self.table) + + if markup: + template = HTML_SUMMARY_TEMPLATE + table = github_markup_decorate(table) + + return template.format(title=title, table=table) + + +def get_state_diff(value: str) -> float: + if value == "N/A": + return 0.0 + + matches = re.match(TOTAL_STATES_DIFF_REGEX, value) + if not matches: + raise ValueError(f"Failed to parse total states diff field value '{value}'") + + if percentage_diff := matches.group("percentage_diff"): + return float(percentage_diff) + + raise ValueError(f"Invalid {VeristatFields.TOTAL_STATES_DIFF} field value: {value}") + + +def parse_table(csv_file: Iterable[str]) -> VeristatInfo: + reader = csv.DictReader(csv_file) + assert reader.fieldnames == VeristatFields.headers() + + new_failures = False + changes = False + table = [] + + for record in reader: + add = False + + verdict_old, verdict_new = ( + record[VeristatFields.VERDICT_OLD], + record[VeristatFields.VERDICT_NEW], + ) + + # Ignore results from completely new and removed programs + if "N/A" in [verdict_new, verdict_old]: + continue + + if record[VeristatFields.VERDICT_DIFF] == "MISMATCH": + changes = True + add = True + verdict = f"{verdict_old} -> {verdict_new}" + if verdict_new == "failure": + new_failures = True + verdict += f" {NEW_FAILURE_SUFFIX}" + else: + verdict = record[VeristatFields.VERDICT_NEW] + + diff = get_state_diff(record[VeristatFields.TOTAL_STATES_DIFF]) + if abs(diff) > TRESHOLD_PCT: + changes = True + add = True + + if not add: + continue + + table.append( + [ + record[VeristatFields.FILE_NAME], + record[VeristatFields.PROG_NAME], + verdict, + f"{diff:+.2f} %", + ] + ) + + return VeristatInfo(table=table, changes=changes, new_failures=new_failures) + + +def github_markup_decorate(input_str: str) -> str: + for text, markup in GITHUB_MARKUP_REPLACEMENTS.items(): + input_str = input_str.replace(text, markup) + return input_str + + +def format_table(headers: List[str], rows: List[List[str]]) -> str: + column_width = [ + max(len(row[column_idx]) for row in [headers] + rows) + for column_idx in range(len(headers)) + ] + + # Row template string in the following format: + # "{0:8}|{1:10}|{2:15}|{3:7}|{4:10}" + row_template = "|".join( + f"{{{idx}:{width}}}" for idx, width in enumerate(column_width) + ) + row_template_nl = f"|{row_template}|\n" + 
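+    # e.g. for widths [8, 10]: row_template_nl == "|{0:8}|{1:10}|\n", so
+    # row_template_nl.format("file", "prog") -> "|file    |prog      |\n"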
+ with io.StringIO() as out: + out.write(row_template_nl.format(*headers)) + + separator_row = ["-" * width for width in column_width] + out.write(row_template_nl.format(*separator_row)) + + for row in rows: + row_str = row_template_nl.format(*row) + out.write(row_str) + + return out.getvalue() + + +def main(compare_csv_filename: os.PathLike, output_filename: os.PathLike) -> None: + with open(compare_csv_filename, newline="", encoding="utf-8") as csv_file: + veristat_results = parse_table(csv_file) + + sys.stdout.write(veristat_results.get_results_summary()) + + with open(output_filename, encoding="utf-8", mode="a") as file: + file.write(veristat_results.get_results_summary(markup=True)) + + if veristat_results.new_failures: + return 1 + + return 0 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Print veristat comparison output as markdown step summary" + ) + parser.add_argument("filename") + args = parser.parse_args() + summary_filename = os.getenv("GITHUB_STEP_SUMMARY") + if not summary_filename: + logging.error("GITHUB_STEP_SUMMARY environment variable is not set") + sys.exit(1) + sys.exit(main(args.filename, summary_filename)) diff --git a/.github/workflows/gcc-bpf.yml b/.github/workflows/gcc-bpf.yml new file mode 100644 index 0000000000000..5f05234399d33 --- /dev/null +++ b/.github/workflows/gcc-bpf.yml @@ -0,0 +1,103 @@ +name: Testing GCC BPF compiler + +on: + workflow_call: + inputs: + runs_on: + required: true + type: string + arch: + required: true + type: string + gcc_version: + required: true + type: string + llvm_version: + required: true + type: string + toolchain: + required: true + type: string + toolchain_full: + required: true + type: string + download_sources: + required: true + type: boolean + +jobs: + test: + name: GCC BPF + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARCH: ${{ inputs.arch }} + BPF_NEXT_BASE_BRANCH: 'master' + GCC_BPF_INSTALL_DIR: ${{ github.workspace }}/gcc-bpf + GCC_BPF_RELEASE_REPO: 'theihor/gcc-bpf' + KBUILD_OUTPUT: ${{ github.workspace }}/src/kbuild-output + REPO_ROOT: ${{ github.workspace }}/src + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + uses: libbpf/ci/get-linux-source@v3 + with: + dest: ${{ env.REPO_ROOT }} + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + + - if: ${{ ! 
inputs.download_sources }} + name: Checkout ${{ github.repository }} to ./src + uses: actions/checkout@v4 + with: + path: 'src' + + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: ${{ env.REPO_ROOT }} + + - name: Untar artifacts + working-directory: ${{ env.REPO_ROOT }} + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + gcc-version: ${{ inputs.gcc_version }} + llvm-version: ${{ inputs.llvm_version }} + + - name: Download GCC BPF compiler + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: .github/scripts/download-gcc-bpf.sh ${{ env.GCC_BPF_RELEASE_REPO }} ${{ env.GCC_BPF_INSTALL_DIR }} + + - name: Build selftests/bpf/test_progs-bpf_gcc + uses: libbpf/ci/build-selftests@v3 + env: + BPF_GCC: ${{ env.GCC_BPF_INSTALL_DIR }} + MAX_MAKE_JOBS: 32 + SELFTESTS_BPF_TARGETS: 'test_progs-bpf_gcc' + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.REPO_ROOT }} + llvm-version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} diff --git a/.github/workflows/kernel-build-test.yml b/.github/workflows/kernel-build-test.yml new file mode 100644 index 0000000000000..ceb47761e905b --- /dev/null +++ b/.github/workflows/kernel-build-test.yml @@ -0,0 +1,167 @@ +name: Reusable Build/Test/Veristat workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + build_runs_on: + required: true + type: string + description: The runners to run the builds on. This is a json string representing an array of labels. + gcc_version: + required: true + type: string + description: GCC version to install + llvm_version: + required: true + type: string + description: LLVM version to install + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + tests: + required: true + type: string + description: A serialized json array with the tests to be running, it must follow the json-matrix format, https://www.jitsejan.com/use-github-actions-with-json-file-as-matrix + run_veristat: + required: true + type: boolean + description: Whether or not to run the veristat job. + run_tests: + required: true + type: boolean + description: Whether or not to run the test job. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. + default: false + build_release: + required: true + type: boolean + description: Build selftests with -O2 optimization in addition to non-optimized build. 
+ default: false + secrets: + AWS_ROLE_ARN: + required: true + +jobs: + + # Build kernel and selftest + build: + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + + build-release: + if: ${{ inputs.build_release }} + uses: ./.github/workflows/kernel-build.yml + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + toolchain: ${{ inputs.toolchain }} + runs_on: ${{ inputs.build_runs_on }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + kernel: ${{ inputs.kernel }} + download_sources: ${{ inputs.download_sources }} + release: true + + test: + if: ${{ inputs.run_tests }} + uses: ./.github/workflows/kernel-test.yml + # Setting name to test here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: "test" + needs: [build] + strategy: + fail-fast: false + matrix: ${{ fromJSON(inputs.tests) }} + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + kernel: ${{ inputs.kernel }} + test: ${{ matrix.test }} + continue_on_error: ${{ toJSON(matrix.continue_on_error) }} + timeout_minutes: ${{ matrix.timeout_minutes }} + + veristat-kernel: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/veristat-kernel.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + + veristat-meta: + # Check for vars.AWS_REGION is necessary to skip this job in case of a PR from a fork. 
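+    # (Secrets and repository variables are not exposed to workflow runs
+    # triggered from forked repositories, so vars.AWS_REGION is empty there.)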
+ if: ${{ inputs.run_veristat && github.repository_owner == 'kernel-patches' && vars.AWS_REGION }} + uses: ./.github/workflows/veristat-meta.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + aws_region: ${{ vars.AWS_REGION }} + runs_on: ${{ inputs.runs_on }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + veristat-scx: + if: ${{ inputs.run_veristat }} + uses: ./.github/workflows/veristat-scx.yml + needs: [build] + permissions: + id-token: write + contents: read + with: + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + runs_on: ${{ inputs.runs_on }} + llvm_version: ${{ inputs.llvm_version }} + + gcc-bpf: + name: 'GCC BPF' + if: ${{ inputs.arch == 'x86_64' }} + uses: ./.github/workflows/gcc-bpf.yml + needs: [build] + with: + # GCC BPF does not need /dev/kvm, so use the "build" runners + runs_on: ${{ inputs.build_runs_on }} + arch: ${{ inputs.arch }} + gcc_version: ${{ inputs.gcc_version }} + llvm_version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} + toolchain_full: ${{ inputs.toolchain_full }} + download_sources: ${{ inputs.download_sources }} + diff --git a/.github/workflows/kernel-build.yml b/.github/workflows/kernel-build.yml new file mode 100644 index 0000000000000..db572cd245a0f --- /dev/null +++ b/.github/workflows/kernel-build.yml @@ -0,0 +1,196 @@ + +name: Reusable build workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + toolchain: + required: true + type: string + description: The toolchain, e.g gcc, llvm + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + gcc_version: + required: true + type: string + description: GCC version to install + llvm_version: + required: true + type: string + description: LLVM version to install + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + download_sources: + required: true + type: boolean + description: Whether to download the linux sources into the working directory. 
+ default: false + release: + required: false + type: boolean + description: Build selftest with -O2 optimization + default: false + +jobs: + build: + name: build kernel and selftests ${{ inputs.release && '-O2' || '' }} + # To run on CodeBuild, runs-on value must correspond to the AWS + # CodeBuild project associated with the kernel-patches webhook + # However matrix.py passes just a 'codebuild' string + runs-on: >- + ${{ + contains(fromJSON(inputs.runs_on), 'codebuild') + && format('codebuild-bpf-ci-{0}-{1}', github.run_id, github.run_attempt) + || fromJSON(inputs.runs_on) + }} + env: + ARTIFACTS_ARCHIVE: "vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst" + BPF_NEXT_BASE_BRANCH: 'master' + BPF_NEXT_FETCH_DEPTH: 64 # A bit of history is needed to facilitate incremental builds + CROSS_COMPILE: ${{ inputs.arch != 'x86_64' && 'true' || '' }} + BUILD_SCHED_EXT_SELFTESTS: ${{ inputs.arch == 'x86_64' || inputs.arch == 'aarch64' && 'true' || '' }} + KBUILD_OUTPUT: ${{ github.workspace }}/kbuild-output + KERNEL: ${{ inputs.kernel }} + KERNEL_ROOT: ${{ github.workspace }} + REPO_PATH: "" + REPO_ROOT: ${{ github.workspace }} + RUNNER_TYPE: ${{ contains(fromJSON(inputs.runs_on), 'codebuild') && 'codebuild' || 'default' }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: ${{ inputs.download_sources && 1 || env.BPF_NEXT_FETCH_DEPTH }} + + - if: ${{ env.RUNNER_TYPE == 'codebuild' }} + shell: bash + run: .github/scripts/tmpfsify-workspace.sh + + - if: ${{ inputs.download_sources }} + name: Download bpf-next tree + env: + FETCH_DEPTH: ${{ env.BPF_NEXT_FETCH_DEPTH }} + uses: libbpf/ci/get-linux-source@v3 + with: + dest: '.kernel' + rev: ${{ env.BPF_NEXT_BASE_BRANCH }} + - uses: libbpf/ci/prepare-incremental-build@v3 + with: + repo-root: ${{ inputs.download_sources && '.kernel' || env.REPO_ROOT }} + base-branch: >- + ${{ inputs.download_sources && env.BPF_NEXT_BASE_BRANCH + || github.event_name == 'pull_request' && github.base_ref + || github.ref_name + }} + arch: ${{ inputs.arch }} + toolchain_full: ${{ inputs.toolchain_full }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + - if: ${{ inputs.download_sources }} + name: Move linux source in place + shell: bash + run: | + cd .kernel + rm -rf .git + mv -t .. $(ls -A) + cd .. + rmdir .kernel + - uses: libbpf/ci/patch-kernel@v3 + with: + patches-root: '${{ github.workspace }}/ci/diffs' + repo-root: ${{ env.REPO_ROOT }} + + - name: Setup build environment + uses: libbpf/ci/setup-build-env@v3 + with: + arch: ${{ inputs.arch }} + gcc-version: ${{ inputs.gcc_version }} + llvm-version: ${{ inputs.llvm_version }} + pahole: master + + # We have to setup qemu+binfmt in order to enable cross-compation of selftests. + # During selftests build, freshly built bpftool is executed. + # On self-hosted bare-metal hosts binfmt is pre-configured. 
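+      # A quick sanity check for the qemu/binfmt setup (hypothetical, not
+      # part of this workflow) is to run a foreign-arch container, e.g.:
+      #   docker run --rm --platform linux/arm64 alpine uname -m  # -> aarch64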
+ - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Set up docker + uses: docker/setup-docker-action@v4 + - if: ${{ env.RUNNER_TYPE == 'codebuild' && env.CROSS_COMPILE }} + name: Setup binfmt and qemu + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:qemu-v9.2.0 + + - name: Build kernel image + uses: libbpf/ci/build-linux@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm_version }} + + - name: Build selftests/bpf + uses: libbpf/ci/build-selftests@v3 + env: + MAX_MAKE_JOBS: 32 + RELEASE: ${{ inputs.release && '1' || '' }} + with: + arch: ${{ inputs.arch }} + kernel-root: ${{ env.KERNEL_ROOT }} + llvm-version: ${{ inputs.llvm_version }} + toolchain: ${{ inputs.toolchain }} + + - if: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + name: Build selftests/sched_ext + uses: libbpf/ci/build-scx-selftests@v3 + with: + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + llvm-version: ${{ inputs.llvm_version }} + max-make-jobs: 32 + + - if: ${{ github.event_name != 'push' }} + name: Build samples + uses: libbpf/ci/build-samples@v3 + with: + arch: ${{ inputs.arch }} + toolchain: ${{ inputs.toolchain }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + max-make-jobs: 32 + llvm-version: ${{ inputs.llvm_version }} + - name: Tar artifacts + id: tar-artifacts + uses: libbpf/ci/tar-artifacts@v3 + env: + ARCHIVE_BPF_SELFTESTS: 'true' + ARCHIVE_MAKE_HELPERS: 'true' + ARCHIVE_SCHED_EXT_SELFTESTS: ${{ env.BUILD_SCHED_EXT_SELFTESTS }} + with: + arch: ${{ inputs.arch }} + archive: ${{ env.ARTIFACTS_ARCHIVE }} + kbuild-output: ${{ env.KBUILD_OUTPUT }} + repo-root: ${{ env.REPO_ROOT }} + - if: ${{ github.event_name != 'push' }} + name: Remove KBUILD_OUTPUT content + shell: bash + run: | + # Remove $KBUILD_OUTPUT to prevent cache creation for pull requests. + # Only on pushed changes are build artifacts actually cached, because + # of github.com/actions/cache's cache isolation logic. + rm -rf "${KBUILD_OUTPUT}" + - uses: actions/upload-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}${{ inputs.release && '-release' || '' }} + if-no-files-found: error + path: ${{ env.ARTIFACTS_ARCHIVE }} diff --git a/.github/workflows/kernel-test.yml b/.github/workflows/kernel-test.yml new file mode 100644 index 0000000000000..2885f2759de4a --- /dev/null +++ b/.github/workflows/kernel-test.yml @@ -0,0 +1,96 @@ +name: Reusable test workflow + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: The toolchain and for llvm, its version, e.g gcc, llvm-15 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + kernel: + required: true + type: string + description: The kernel to run the test against. For KPD this is always LATEST, which runs against a newly built kernel. + test: + required: true + type: string + description: The test to run in the vm, e.g test_progs, test_maps, test_progs_no_alu32... + continue_on_error: + required: true + type: string + description: Whether to continue on error. 
This is typically set to true for parallel tests which are currently known to fail, but we don't want to fail the whole CI because of that. + timeout_minutes: + required: true + type: number + description: In case a test runs for too long, after how many seconds shall we timeout and error. + +jobs: + test: + name: ${{ inputs.test }} on ${{ inputs.arch }} with ${{ inputs.toolchain_full }} + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + env: + ARCH: ${{ inputs.arch }} + KERNEL: ${{ inputs.kernel }} + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + CONTINUE_ON_ERROR: ${{ inputs.continue_on_error }} + DEPLOYMENT: ${{ github.repository == 'kernel-patches/bpf' && 'prod' || 'rc' }} + ALLOWLIST_FILE: /tmp/allowlist + DENYLIST_FILE: /tmp/denylist + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: . + + - name: Untar artifacts + # zstd is installed by default in the runner images. + run: zstd -d -T0 vmlinux-${{ inputs.arch }}-${{ inputs.toolchain_full }}.tar.zst --stdout | tar -xf - + + - name: Run selftests + uses: libbpf/ci/run-vmtest@v3 + # https://github.com/actions/runner/issues/1483#issuecomment-1031671517 + # booleans are weird in GH. + continue-on-error: ${{ fromJSON(env.CONTINUE_ON_ERROR) }} + timeout-minutes: ${{ inputs.timeout_minutes }} + env: + ARCH: ${{ inputs.arch }} + DEPLOYMENT: ${{ env.DEPLOYMENT }} + KERNEL_TEST: ${{ inputs.test }} + SELFTESTS_BPF: ${{ github.workspace }}/selftests/bpf + VMTEST_CONFIGS: ${{ github.workspace }}/ci/vmtest/configs + TEST_PROGS_TRAFFIC_MONITOR: ${{ inputs.arch == 'x86_64' && 'true' || '' }} + TEST_PROGS_WATCHDOG_TIMEOUT: 600 + with: + arch: ${{ inputs.arch }} + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: ${{ env.REPO_ROOT }} + max-cpu: 8 + kernel-test: ${{ inputs.test }} + # Here we must use kbuild-output local to the repo, because + # it was extracted from the artifacts. + kbuild-output: ${{ env.REPO_ROOT }}/kbuild-output + + - if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tmon-logs-${{ inputs.arch }}-${{ inputs.toolchain_full }}-${{ inputs.test }} + if-no-files-found: ignore + path: /tmp/tmon_pcap/* diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000..1c910fd297309 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,65 @@ +name: "lint" + +on: + pull_request: + push: + branches: + - master + +jobs: + shellcheck: + # This workflow gets injected into other Linux repositories, but we don't + # want it to run there. + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: ShellCheck + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + env: + SHELLCHECK_OPTS: --severity=warning --exclude=SC1091 + + # Ensure some consistency in the formatting. 
+ lint: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run black + uses: psf/black@stable + with: + src: ./.github/scripts + + validate_matrix: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Validate matrix.py + runs-on: ubuntu-latest + env: + GITHUB_REPOSITORY_OWNER: ${{ matrix.owner }} + GITHUB_REPOSITORY: ${{ matrix.repository }} + GITHUB_OUTPUT: /dev/stdout + strategy: + matrix: + owner: ['kernel-patches', 'foo'] + repository: ['bpf', 'vmtest', 'bar'] + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: run script + run: | + python3 .github/scripts/matrix.py + + unittests: + if: ${{ github.repository == 'kernel-patches/vmtest' }} + name: Unittests + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Run unittests + run: python3 -m unittest scripts/tests/*.py + working-directory: .github diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000000000..24773459a252d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,73 @@ +name: bpf-ci + +on: + pull_request: + push: + branches: + - bpf_base + - bpf-next_base + - bpf-net_base + - for-next_base + +concurrency: + group: ci-test-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + set-matrix: + # FIXME: set-matrix is lightweight, run it on any self-hosted machines for kernel-patches org + # so we do not wait for GH hosted runners when there potentially all are busy because of bpf-rc + # repo for instance. + # This could be somehow fixed long term by making this action/workflow re-usable and letting the called + # specify what to run on. + runs-on: ${{ github.repository_owner == 'kernel-patches' && 'x86_64' || 'ubuntu-latest' }} + permissions: read-all + outputs: + build-matrix: ${{ steps.set-matrix-impl.outputs.build_matrix }} + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + - name: Install script dependencies + shell: bash + run: | + sudo apt-get -y update + sudo apt-get -y install python3-requests + - id: set-matrix-impl + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_READ_RUNNERS }} + run: | + python3 .github/scripts/matrix.py + + build-and-test: + # Setting name to arch-compiler here to avoid lengthy autogenerated names due to matrix + # e.g build-and-test x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc + name: ${{ matrix.arch }} ${{ matrix.kernel_compiler }}-${{ matrix.kernel_compiler == 'gcc' && matrix.gcc_version || matrix.llvm_version }} + uses: ./.github/workflows/kernel-build-test.yml + needs: [set-matrix] + permissions: + id-token: write + contents: read + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.set-matrix.outputs.build-matrix) }} + with: + arch: ${{ matrix.arch }} + toolchain: ${{ matrix.kernel_compiler }} + toolchain_full: ${{ matrix.kernel_compiler }}-${{ matrix.kernel_compiler == 'gcc' && matrix.gcc_version || matrix.llvm_version }} + runs_on: ${{ toJSON(matrix.runs_on) }} + build_runs_on: ${{ toJSON(matrix.build_runs_on) }} + gcc_version: ${{ matrix.gcc_version }} + llvm_version: ${{ matrix.llvm_version }} + kernel: ${{ matrix.kernel }} + tests: ${{ toJSON(matrix.tests) }} + run_veristat: ${{ matrix.run_veristat }} + # We only run tests on pull requests. 
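+      # On push events the tests are skipped: only builds and the veristat
+      # jobs run, and veristat results are cached as the new baseline instead
+      # of being compared (see .github/actions/veristat_baseline_compare).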
+ run_tests: ${{ github.event_name != 'push' }} + # Download sources + download_sources: ${{ github.repository == 'kernel-patches/vmtest' }} + build_release: ${{ matrix.build_release }} + secrets: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} diff --git a/.github/workflows/veristat-kernel.yml b/.github/workflows/veristat-kernel.yml new file mode 100644 index 0000000000000..8c9ba715bf277 --- /dev/null +++ b/.github/workflows/veristat-kernel.yml @@ -0,0 +1,66 @@ +name: veristat_kernel + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: Toolchain identifier, such as llvm-20 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + +jobs: + veristat: + name: veristat-kernel + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain_full }} + VERISTAT_DUMP_LOG_ON_FAILURE: 'true' + VERISTAT_TARGET: kernel + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.kernel.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-kernel + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-kernel diff --git a/.github/workflows/veristat-meta.yml b/.github/workflows/veristat-meta.yml new file mode 100644 index 0000000000000..675127d322491 --- /dev/null +++ b/.github/workflows/veristat-meta.yml @@ -0,0 +1,88 @@ +name: veristat_meta + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: Toolchain identifier, such as llvm-20 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + aws_region: + required: true + type: string + description: The AWS region where we pull bpf objects to run against veristat. + secrets: + AWS_ROLE_ARN: + required: true + description: The AWS role used by GH to pull BPF objects from AWS. + +jobs: + veristat: + name: veristat-meta + runs-on: ${{ fromJSON(inputs.runs_on) }} + timeout-minutes: 100 + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain_full }} + VERISTAT_TARGET: meta + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . 
+ + - name: Untar artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-region: ${{ inputs.aws_region }} + role-to-assume: ${{ secrets.AWS_ROLE_ARN }} + role-session-name: github-action-bpf-ci + + - name: Download BPF objects + run: | + mkdir ./bpf_objects + aws s3 sync s3://veristat-bpf-binaries ./bpf_objects + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' + max-cpu: 8 + kernel-test: 'run_veristat' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.meta.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-meta + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-meta + diff --git a/.github/workflows/veristat-scx.yml b/.github/workflows/veristat-scx.yml new file mode 100644 index 0000000000000..e2310be83e638 --- /dev/null +++ b/.github/workflows/veristat-scx.yml @@ -0,0 +1,103 @@ +name: veristat_kernel + +on: + workflow_call: + inputs: + arch: + required: true + type: string + description: The architecture to build against, e.g x86_64, aarch64, s390x... + toolchain_full: + required: true + type: string + description: Toolchain identifier, such as llvm-20 + runs_on: + required: true + type: string + description: The runners to run the test on. This is a json string representing an array of labels. + llvm_version: + required: true + type: string + +jobs: + + build-scheds: + name: build sched-ext/scx + runs-on: ${{ fromJSON(inputs.runs_on) }} + env: + LLVM_VERSION: ${{ inputs.llvm_version }} + SCX_BUILD_OUTPUT: ${{ github.workspace }}/scx-build-output + SCX_PROGS: ${{ github.workspace }}/scx-progs + SCX_REVISION: main + steps: + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + - uses: libbpf/ci/build-scx-scheds@v3 + with: + output-dir: ${{ env.SCX_BUILD_OUTPUT }} + - name: Collect scx progs + run: ${{ github.workspace }}/.github/scripts/collect-scx-bpf-progs.sh ${{ env.SCX_PROGS }} + - name: Upload scx progs + uses: actions/upload-artifact@v4 + with: + name: scx-progs-${{ inputs.arch }}-${{ inputs.toolchain_full }} + if-no-files-found: error + path: ${{ env.SCX_PROGS }} + + veristat: + name: veristat-scx + runs-on: ${{ fromJSON(inputs.runs_on) }} + needs: [build-scheds] + permissions: + id-token: write + contents: read + env: + KERNEL: LATEST + REPO_ROOT: ${{ github.workspace }} + REPO_PATH: "" + KBUILD_OUTPUT: kbuild-output/ + ARCH_AND_TOOL: ${{ inputs.arch }}-${{ inputs.toolchain_full }} + VERISTAT_DUMP_LOG_ON_FAILURE: 'true' + VERISTAT_TARGET: scx + SCX_PROGS: ${{ github.workspace }}/scx-progs + + steps: + + - uses: actions/checkout@v4 + with: + sparse-checkout: | + .github + ci + + - name: Download kernel build artifacts + uses: actions/download-artifact@v4 + with: + name: vmlinux-${{ env.ARCH_AND_TOOL }} + path: . + + - name: Untar kernel build artifacts + run: zstd -d -T0 vmlinux-${{ env.ARCH_AND_TOOL }}.tar.zst --stdout | tar -xf - + + - name: Download scx progs + uses: actions/download-artifact@v4 + with: + name: scx-progs-${{ inputs.arch }}-${{ inputs.toolchain_full }} + path: ${{ env.SCX_PROGS }} + + - name: Run veristat + uses: libbpf/ci/run-vmtest@v3 + with: + arch: x86_64 + vmlinuz: '${{ github.workspace }}/vmlinuz' + kernel-root: '.' 
+ kernel-test: 'run_veristat' + output-dir: '${{ github.workspace }}' + + - name: Compare and save veristat.scx.csv + uses: ./.github/actions/veristat_baseline_compare + with: + veristat_output: veristat-scx + baseline_name: ${{ env.ARCH_AND_TOOL}}-baseline-veristat-scx diff --git a/.mailmap b/.mailmap index d57531dab08b4..7f87f4d31df18 100644 --- a/.mailmap +++ b/.mailmap @@ -223,6 +223,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert + Ed L. Cashin Elliot Berman Enric Balletbo i Serra @@ -668,6 +669,7 @@ Muchun Song Ross Zwisler Rudolf Marek Rui Saraiva +Sachin Mokashi Sachin P Sant Sai Prakash Ranjan Sakari Ailus @@ -830,3 +832,6 @@ Yosry Ahmed Yusuke Goda Zack Rusin Zhu Yanjun +Zijun Hu +Zijun Hu +Zijun Hu diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 20fabdf6567e1..9c6cd52f69cf7 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -311,6 +311,27 @@ crashkernel syntax crashkernel=0,low +4) crashkernel=size,cma + + Reserve additional crash kernel memory from CMA. This reservation is + usable by the first system's userspace memory and kernel movable + allocations (memory balloon, zswap). Pages allocated from this memory + range will not be included in the vmcore so this should not be used if + dumping of userspace memory is intended and it has to be expected that + some movable kernel pages may be missing from the dump. + + A standard crashkernel reservation, as described above, is still needed + to hold the crash kernel and initrd. + + This option increases the risk of a kdump failure: DMA transfers + configured by the first kernel may end up corrupting the second + kernel's memory. + + This reservation method is intended for systems that can't afford to + sacrifice enough memory for standard crashkernel reservation and where + less reliable and possibly incomplete kdump is preferable to no kdump at + all. + Boot into System Kernel ----------------------- 1) Update the boot loader (such as grub, yaboot, or lilo) configuration diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1f2c0874da9d..ac4a239b9388c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -986,6 +986,28 @@ 0: to disable low allocation. It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. + crashkernel=size[KMG],cma + [KNL, X86] Reserve additional crash kernel memory from + CMA. This reservation is usable by the first system's + userspace memory and kernel movable allocations (memory + balloon, zswap). Pages allocated from this memory range + will not be included in the vmcore so this should not + be used if dumping of userspace memory is intended and + it has to be expected that some movable kernel pages + may be missing from the dump. + + A standard crashkernel reservation, as described above, + is still needed to hold the crash kernel and initrd. + + This option increases the risk of a kdump failure: DMA + transfers configured by the first kernel may end up + corrupting the second kernel's memory. + + This reservation method is intended for systems that + can't afford to sacrifice enough memory for standard + crashkernel reservation and where less reliable and + possibly incomplete kdump is preferable to no kdump at + all. 
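+
+			A hypothetical command line pairing the two
+			reservations (sizes purely illustrative):
+
+				crashkernel=256M crashkernel=1G,cma
+
+			Here the fixed 256M region holds the crash kernel
+			and initrd, while the CMA range provides the bulk
+			of the capture memory yet stays usable by the
+			first kernel.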
 	cryptomgr.notests
 			[KNL] Disable crypto self-tests

diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index e3cfffef5a633..c1768d9e80fa2 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -53,20 +53,25 @@ following prctl:
 
   prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector])
 
-<op> is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and
-disable the mechanism globally for that thread. When
-PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
-
-[<offset>, <offset>+<length>) delimit a memory region interval
-from which syscalls are always executed directly, regardless of the
-userspace selector. This provides a fast path for the C library, which
-includes the most common syscall dispatchers in the native code
-applications, and also provides a way for the signal handler to return
+<op> is either PR_SYS_DISPATCH_EXCLUSIVE_ON/PR_SYS_DISPATCH_INCLUSIVE_ON
+or PR_SYS_DISPATCH_OFF, to enable and disable the mechanism globally for
+that thread. When PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
+
+For PR_SYS_DISPATCH_EXCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are always executed directly,
+regardless of the userspace selector. This provides a fast path for the
+C library, which includes the most common syscall dispatchers in the native
+code applications, and also provides a way for the signal handler to return
 without triggering a nested SIGSYS on (rt\_)sigreturn.
 Users of this interface should make sure that at least the signal
 trampoline code is included in this region. In addition, for syscalls
 that implement the trampoline code on the vDSO, that trampoline is never
 intercepted.
+For PR_SYS_DISPATCH_INCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are dispatched based on
+the userspace selector. Syscalls from outside of the range are always
+executed directly.
+
 [selector] is a pointer to a char-sized region in the process memory
 region, that provides a quick way to enable disable syscall redirection
 thread-wide, without the need to invoke the kernel directly.
selector diff --git a/MAINTAINERS b/MAINTAINERS index efb51ee92683d..2ef5c15070508 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15676,6 +15676,8 @@ MEMBLOCK AND MEMORY MANAGEMENT INITIALIZATION M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git for-next +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git fixes F: Documentation/core-api/boot-time-mm.rst F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h @@ -15848,6 +15850,17 @@ F: mm/numa.c F: mm/numa_emulation.c F: mm/numa_memblks.c +MEMORY MANAGEMENT - OOM KILLER +M: Michal Hocko +R: David Rientjes +R: Shakeel Butt +L: linux-mm@kvack.org +S: Maintained +F: include/linux/oom.h +F: include/trace/events/oom.h +F: include/uapi/linux/oom.h +F: mm/oom_kill.c + MEMORY MANAGEMENT - PAGE ALLOCATOR M: Andrew Morton M: Vlastimil Babka @@ -15862,8 +15875,17 @@ F: include/linux/compaction.h F: include/linux/gfp.h F: include/linux/page-isolation.h F: mm/compaction.c +F: mm/debug_page_alloc.c +F: mm/fail_page_alloc.c F: mm/page_alloc.c +F: mm/page_ext.c +F: mm/page_frag_cache.c F: mm/page_isolation.c +F: mm/page_owner.c +F: mm/page_poison.c +F: mm/page_reporting.c +F: mm/show_mem.c +F: mm/shuffle.c MEMORY MANAGEMENT - RECLAIM M: Andrew Morton @@ -23027,6 +23049,7 @@ F: drivers/md/md* F: drivers/md/raid* F: include/linux/raid/ F: include/uapi/linux/raid/ +F: lib/raid6/ SOLIDRUN CLEARFOG SUPPORT M: Russell King diff --git a/README b/README index fd903645e6de0..e69de29bb2d1d 100644 --- a/README +++ b/README @@ -1,18 +0,0 @@ -Linux kernel -============ - -There are several guides for kernel developers and users. These guides can -be rendered in a number of formats, like HTML and PDF. Please read -Documentation/admin-guide/README.rst first. - -In order to build the documentation, use ``make htmldocs`` or -``make pdfdocs``. The formatted documentation can also be read online at: - - https://www.kernel.org/doc/html/latest/ - -There are various text files in the Documentation/ subdirectory, -several of them using the reStructuredText markup notation. - -Please read the Documentation/process/changes.rst file, as it contains the -requirements for building and running the kernel, and information about -the problems which may result by upgrading your kernel. 
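To make the two dispatch modes described in the syscall user dispatch documentation above concrete, here is a minimal userspace sketch of the inclusive mode. It assumes a kernel carrying this change and uapi headers that define PR_SYS_DISPATCH_INCLUSIVE_ON alongside the existing SYSCALL_DISPATCH_FILTER_* selector values; the use of main()'s address and the page-sized length are arbitrary choices for illustration, not part of the patch.

    #include <stdio.h>
    #include <sys/prctl.h>

    /* The selector lives in process memory; ALLOW lets syscalls run natively. */
    static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

    int main(void)
    {
            /*
             * Inclusive mode: only syscalls issued from [main, main + 4096)
             * are dispatched according to the selector; syscalls issued from
             * anywhere else (e.g. libc) always execute directly.
             */
            if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_INCLUSIVE_ON,
                      (unsigned long)main, 4096UL, &selector)) {
                    perror("prctl(PR_SET_SYSCALL_USER_DISPATCH)");
                    return 1;
            }

            /*
             * Writing SYSCALL_DISPATCH_FILTER_BLOCK to the selector would now
             * make syscalls from that region raise SIGSYS. This sketch simply
             * turns the mechanism back off; per the documentation, the other
             * fields must be zero for PR_SYS_DISPATCH_OFF.
             */
            if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF, 0, 0, 0))
                    perror("prctl(PR_SYS_DISPATCH_OFF)");

            return 0;
    }

Note that the prctl() calls themselves are issued from libc code, outside the delimited region, so they execute directly regardless of the selector value.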
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index b1bfbd11980d9..d38f4d6759e46 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -79,10 +80,12 @@ mk_resource_name(int pe, int port, char *str) { char tmp[80]; char *name; - - sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES); - strcpy(name, tmp); + size_t sz; + + sz = scnprintf(tmp, sizeof(tmp), "PCI %s PE %d PORT %d", str, pe, port); + sz += 1; /* NUL terminator */ + name = memblock_alloc_or_panic(sz, SMP_CACHE_BYTES); + strscpy(name, tmp, sz); return name; } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index a41c93988d2c6..0bfd66c7ada05 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void) total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); /* invalid value specified or crashkernel=0 */ if (ret || !crash_size) return; diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c index 885e0c5e8c20d..3d96fb41d6245 100644 --- a/arch/arm/probes/uprobes/core.c +++ b/arch/arm/probes/uprobes/core.c @@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - __opcode_to_mem_arm(auprobe->bpinsn)); + __opcode_to_mem_arm(auprobe->bpinsn), true); } bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 28be048db3f63..bceeaec21fb93 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -19,7 +19,7 @@ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) #define HAVE_ARCH_BUG diff --git a/arch/arm64/kernel/watchdog_hld.c b/arch/arm64/kernel/watchdog_hld.c index dcd25322127c3..e55548cb26df0 100644 --- a/arch/arm64/kernel/watchdog_hld.c +++ b/arch/arm64/kernel/watchdog_hld.c @@ -34,3 +34,61 @@ bool __init arch_perf_nmi_is_available(void) */ return arm_pmu_irq_is_nmi(); } + +static int watchdog_perf_update_period(void *data) +{ + int cpu = raw_smp_processor_id(); + u64 max_cpu_freq, new_period; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + return 0; + + new_period = watchdog_thresh * max_cpu_freq; + hardlockup_detector_perf_adjust_period(cpu, new_period); + + return 0; +} + +static int watchdog_freq_notifier_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = data; + int cpu; + + if (val != CPUFREQ_CREATE_POLICY) + return NOTIFY_DONE; + + /* + * Let each online CPU related to the policy update the period by their + * own. This will serialize with the framework on start/stop the lockup + * detector (softlockup_{start,stop}_all) and avoid potential race + * condition. 
Otherwise we may have below theoretical race condition: + * (core 0/1 share the same policy) + * [core 0] [core 1] + * hardlockup_detector_event_create() + * hw_nmi_get_sample_period() + * (cpufreq registered, notifier callback invoked) + * watchdog_freq_notifier_callback() + * watchdog_perf_update_period() + * (since core 1's event's not yet created, + * the period is not set) + * perf_event_create_kernel_counter() + * (event's period is SAFE_MAX_CPU_FREQ) + */ + for_each_cpu(cpu, policy->cpus) + smp_call_on_cpu(cpu, watchdog_perf_update_period, NULL, false); + + return NOTIFY_DONE; +} + +static struct notifier_block watchdog_freq_notifier = { + .notifier_call = watchdog_freq_notifier_callback, +}; + +static int __init init_watchdog_freq_notifier(void) +{ + return cpufreq_register_notifier(&watchdog_freq_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(init_watchdog_freq_notifier); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 0c8c35dd645e4..ea84a61ed5084 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index f6f254f2c5db5..cad807b100ada 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -20,39 +20,38 @@ #endif #ifndef CONFIG_GENERIC_BUG -#define __BUG_ENTRY(flags) +#define __BUG_ENTRY(cond_str, flags) #else -#define __BUG_ENTRY(flags) \ +#define __BUG_ENTRY(cond_str, flags) \ .pushsection __bug_table, "aw"; \ .align 2; \ 10000: .long 10001f - .; \ - _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - .short flags; \ + _BUGVERBOSE_LOCATION(WARN_CONDITION_STR(cond_str) __FILE__, __LINE__) \ + .short flags; \ .popsection; \ 10001: #endif -#define ASM_BUG_FLAGS(flags) \ - __BUG_ENTRY(flags) \ +#define ASM_BUG_FLAGS(cond_str, flags) \ + __BUG_ENTRY(cond_str, flags) \ break BRK_BUG; -#define ASM_BUG() ASM_BUG_FLAGS(0) +#define ASM_BUG() ASM_BUG_FLAGS("", 0) -#define __BUG_FLAGS(flags, extra) \ - asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \ - extra); +#define __BUG_FLAGS(cond_str, flags, extra) \ + asm_inline volatile (__stringify(ASM_BUG_FLAGS(cond_str, flags)) extra); -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\ + __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\ instrumentation_end(); \ } while (0) #define BUG() \ do { \ instrumentation_begin(); \ - __BUG_FLAGS(0, ""); \ + __BUG_FLAGS("", 0, ""); \ unreachable(); \ } while (0) diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index b99fbb388fe03..22b27cd447a1b 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, &low_size, &high); + &crash_size, &crash_base, &low_size, NULL, &high); if (ret) return; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index fbfe0771317ea..11b9b6b63e19f 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void) total_mem = memblock_phys_mem_size(); ret = 
parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base, - NULL, NULL); + NULL, NULL, NULL); if (ret != 0 || crash_size <= 0) return; diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 833555f74ffa7..5aa1623e4f2f1 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -50,7 +50,7 @@ #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -61,12 +61,12 @@ "\t.short %1, %2\n" \ "\t.blockz %3-2*4-2*2\n" \ "\t.popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #else -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 1db485aacbd9b..171b6b2ba1003 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -51,11 +51,11 @@ ".previous\n" #endif -#define BUG_ENTRY(insn, flags, ...) \ +#define BUG_ENTRY(cond_str, insn, flags, ...) \ __asm__ __volatile__( \ "1: " insn "\n" \ _EMIT_BUG_ENTRY \ - : : "i" (__FILE__), "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry)), \ ##__VA_ARGS__) @@ -67,12 +67,12 @@ */ #define BUG() do { \ - BUG_ENTRY("twi 31, 0, 0", 0); \ + BUG_ENTRY("", "twi 31, 0, 0", 0); \ unreachable(); \ } while (0) #define HAVE_ARCH_BUG -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) +#define __WARN_FLAGS(cond_str, flags) BUG_ENTRY(cond_str, "twi 31, 0, 0", BUGFLAG_WARNING | (flags)) #ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ @@ -80,7 +80,7 @@ if (x) \ BUG(); \ } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \ + BUG_ENTRY(#x, PPC_TLNEI " %4, 0", 0, "r" ((__force long)(x))); \ } \ } while (0) @@ -90,7 +90,7 @@ if (__ret_warn_on) \ __WARN(); \ } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", \ + BUG_ENTRY(#x, PPC_TLNEI " %4, 0", \ BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ "r" (__ret_warn_on)); \ } \ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ca49e40c473e..28cab25d5b336 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void) * memory at a predefined offset. 
*/ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &size, &base, NULL, NULL); + &size, &base, NULL, NULL, NULL); if (ret == 0 && size > 0) { unsigned long max_size; diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 00e9c267b912f..d1a2d755381ca 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void) /* use common parsing */ ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret) return; diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 5c8d1bb98b3e8..5e4897daaaeae 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size) int ret; ret = parse_crashkernel(boot_command_line, size, &crash_size, - &crash_base, NULL, NULL); + &crash_base, NULL, NULL, NULL); if (ret != 0 || crash_size == 0) return; if (crash_base == 0) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 36061f4732b74..6ff7dd3056397 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -96,6 +96,7 @@ config RISCV select CLINT_TIMER if RISCV_M_MODE select CLONE_BACKWARDS select COMMON_CLK + select CPU_NO_EFFICIENT_FFS if !RISCV_ISA_ZBB select CPU_PM if CPU_IDLE || HIBERNATION || SUSPEND select EDAC_SUPPORT select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index 1aaea81fb1413..da9b8e83934d4 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -50,7 +50,7 @@ typedef u32 bug_insn_t; #endif #ifdef CONFIG_GENERIC_BUG -#define __BUG_FLAGS(flags) \ +#define __BUG_FLAGS(cond_str, flags) \ do { \ __asm__ __volatile__ ( \ "1:\n\t" \ @@ -61,22 +61,22 @@ do { \ ".org 2b + %3\n\t" \ ".popsection" \ : \ - : "i" (__FILE__), "i" (__LINE__), \ + : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) #else /* CONFIG_GENERIC_BUG */ -#define __BUG_FLAGS(flags) do { \ +#define __BUG_FLAGS(cond_str, flags) do { \ __asm__ __volatile__ ("ebreak\n"); \ } while (0) #endif /* CONFIG_GENERIC_BUG */ #define BUG() do { \ - __BUG_FLAGS(0); \ + __BUG_FLAGS("", 0); \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags)) +#define __WARN_FLAGS(cond_str, flags) __BUG_FLAGS(cond_str, BUGFLAG_WARNING|(flags)) #define HAVE_ARCH_BUG diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f7c9a1caa83e6..785c7104fde75 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -361,6 +363,9 @@ void __init setup_arch(char **cmdline_p) riscv_user_isa_enable(); riscv_spinlock_init(); + + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB) || !riscv_isa_extension_available(NULL, ZBB)) + static_branch_disable(&efficient_ffs_key); } bool arch_cpu_is_hotpluggable(int cpu) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8d0374d7ce8ed..15683ae13fa5d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, NULL, &high); if (ret) return; diff --git 
a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index c500d45fb465c..837bfbde0c518 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -8,26 +8,24 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __EMIT_BUG(x) do { \ +#define __EMIT_BUG(cond_str, x) do { \ asm_inline volatile( \ "0: mc 0,0\n" \ - ".section .rodata.str,\"aMS\",@progbits,1\n" \ - "1: .asciz \""__FILE__"\"\n" \ - ".previous\n" \ ".section __bug_table,\"aw\"\n" \ - "2: .long 0b-.\n" \ - " .long 1b-.\n" \ - " .short %0,%1\n" \ - " .org 2b+%2\n" \ + "1: .long 0b-.\n" \ + " .long %0-.\n" \ + " .short %1,%2\n" \ + " .org 1b+%3\n" \ ".previous\n" \ - : : "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__),\ + "i" (__LINE__), \ "i" (x), \ "i" (sizeof(struct bug_entry))); \ } while (0) #else /* CONFIG_DEBUG_BUGVERBOSE */ -#define __EMIT_BUG(x) do { \ +#define __EMIT_BUG(cond_str, x) do { \ asm_inline volatile( \ "0: mc 0,0\n" \ ".section __bug_table,\"aw\"\n" \ @@ -42,12 +40,12 @@ #endif /* CONFIG_DEBUG_BUGVERBOSE */ #define BUG() do { \ - __EMIT_BUG(0); \ + __EMIT_BUG("", 0); \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) do { \ - __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ +#define __WARN_FLAGS(cond_str, flags) do { \ + __EMIT_BUG(cond_str, BUGFLAG_WARNING|(flags)); \ } while (0) #define WARN_ON(x) ({ \ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index f244c5560e7f6..b99aeb0db2ee3 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void) int rc; rc = parse_crashkernel(boot_command_line, ident_map_size, - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index 05a485c4fabcf..891276687355c 100644 --- a/arch/sh/include/asm/bug.h +++ b/arch/sh/include/asm/bug.h @@ -52,14 +52,14 @@ do { \ unreachable(); \ } while (0) -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ __asm__ __volatile__ ( \ "1:\t.short %O0\n" \ _EMIT_BUG_ENTRY \ : \ : "n" (TRAPA_BUG_OPCODE), \ - "i" (__FILE__), \ + "i" (WARN_CONDITION_STR(cond_str) __FILE__), \ "i" (__LINE__), \ "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index 8321b31d2e19d..37073ca1e0ad3 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -146,7 +146,7 @@ void __init reserve_crashkernel(void) return; ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base, NULL, NULL); + &crash_size, &crash_base, NULL, NULL, NULL); if (ret == 0 && crash_size > 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 640fcac3af745..3f9fb3698d669 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -71,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] 
\(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f4f7b22d81137..3a38fdcdb9bd3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,6 +106,11 @@ vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a +vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o + +ifdef CONFIG_EFI_SBAT +$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE) +endif $(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S new file mode 100644 index 0000000000000..838f70a997ddf --- /dev/null +++ b/arch/x86/boot/compressed/sbat.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Embed SBAT data in the kernel. + */ + .pushsection ".sbat", "a", @progbits + .incbin CONFIG_EFI_SBAT_FILE + .popsection diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 3b2bc61c9408e..587ce3e7c5048 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -43,6 +43,14 @@ SECTIONS *(.rodata.*) _erodata = . ; } +#ifdef CONFIG_EFI_SBAT + .sbat : ALIGN(0x1000) { + _sbat = . ; + *(.sbat) + _esbat = ALIGN(0x1000); + . = _esbat; + } +#endif .data : ALIGN(0x1000) { _data = . ; *(.data) diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index e1f4fd5bc8eeb..9bea5a1e2c52c 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -179,15 +179,11 @@ pecompat_fstart: #else .set pecompat_fstart, setup_size #endif - .ascii ".text" - .byte 0 - .byte 0 - .byte 0 - .long ZO__data - .long setup_size - .long ZO__data # Size of initialized data - # on disk - .long setup_size + .ascii ".text\0\0\0" + .long textsize # VirtualSize + .long setup_size # VirtualAddress + .long textsize # SizeOfRawData + .long setup_size # PointerToRawData .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations @@ -196,6 +192,23 @@ pecompat_fstart: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_EXECUTE # Characteristics +#ifdef CONFIG_EFI_SBAT + .ascii ".sbat\0\0\0" + .long ZO__esbat - ZO__sbat # VirtualSize + .long setup_size + ZO__sbat # VirtualAddress + .long ZO__esbat - ZO__sbat # SizeOfRawData + .long setup_size + ZO__sbat # PointerToRawData + + .long 0, 0, 0 + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_DISCARDABLE # Characteristics + + .set textsize, ZO__sbat +#else + .set textsize, ZO__data +#endif + .ascii ".data\0\0\0" .long ZO__end - ZO__data # VirtualSize .long setup_size + ZO__data # VirtualAddress diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b6db4e0b936bf..8375ca7fbd8a6 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1389,16 +1389,16 @@ int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, } EXPORT_SYMBOL_GPL(snp_issue_svsm_attest_req); -static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_data *input, - struct snp_guest_request_ioctl *rio) +static int snp_issue_guest_request(struct snp_guest_req *req) { + struct 
snp_req_data *input = &req->input; struct ghcb_state state; struct es_em_ctxt ctxt; unsigned long flags; struct ghcb *ghcb; int ret; - rio->exitinfo2 = SEV_RET_NO_FW_CALL; + req->exitinfo2 = SEV_RET_NO_FW_CALL; /* * __sev_get_ghcb() needs to run with IRQs disabled because it is using @@ -1423,8 +1423,8 @@ static int snp_issue_guest_request(struct snp_guest_req *req, struct snp_req_dat if (ret) goto e_put; - rio->exitinfo2 = ghcb->save.sw_exit_info_2; - switch (rio->exitinfo2) { + req->exitinfo2 = ghcb->save.sw_exit_info_2; + switch (req->exitinfo2) { case 0: break; @@ -1919,8 +1919,7 @@ static int enc_payload(struct snp_msg_desc *mdesc, u64 seqno, struct snp_guest_r return 0; } -static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) +static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req) { unsigned long req_start = jiffies; unsigned int override_npages = 0; @@ -1934,7 +1933,7 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r * sequence number must be incremented or the VMPCK must be deleted to * prevent reuse of the IV. */ - rc = snp_issue_guest_request(req, &req->input, rio); + rc = snp_issue_guest_request(req); switch (rc) { case -ENOSPC: /* @@ -1987,7 +1986,7 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r snp_inc_msg_seqno(mdesc); if (override_err) { - rio->exitinfo2 = override_err; + req->exitinfo2 = override_err; /* * If an extended guest request was issued and the supplied certificate @@ -2005,12 +2004,20 @@ static int __handle_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_r return rc; } -int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req) { u64 seqno; int rc; + /* + * enc_payload() calls aesgcm_encrypt(), which can potentially offload to HW. + * The offload's DMA SG list of data to encrypt has to be in linear mapping. + */ + if (!virt_addr_valid(req->req_buf) || !virt_addr_valid(req->resp_buf)) { + pr_warn("AES-GCM buffers must be in linear mapping\n"); + return -EINVAL; + } + guard(mutex)(&snp_cmd_mutex); /* Check if the VMPCK is not empty */ @@ -2043,14 +2050,14 @@ int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req req->input.resp_gpa = __pa(mdesc->response); req->input.data_gpa = req->certs_data ? __pa(req->certs_data) : 0; - rc = __handle_guest_request(mdesc, req, rio); + rc = __handle_guest_request(mdesc, req); if (rc) { if (rc == -EIO && - rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN) + req->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN)) return rc; pr_alert("Detected error from ASP request. 
rc: %d, exitinfo2: 0x%llx\n", - rc, rio->exitinfo2); + rc, req->exitinfo2); snp_disable_vmpck(mdesc); return rc; @@ -2069,11 +2076,10 @@ EXPORT_SYMBOL_GPL(snp_send_guest_request); static int __init snp_get_tsc_info(void) { - struct snp_guest_request_ioctl *rio; struct snp_tsc_info_resp *tsc_resp; struct snp_tsc_info_req *tsc_req; struct snp_msg_desc *mdesc; - struct snp_guest_req *req; + struct snp_guest_req req = {}; int rc = -ENOMEM; tsc_req = kzalloc(sizeof(*tsc_req), GFP_KERNEL); @@ -2089,32 +2095,24 @@ static int __init snp_get_tsc_info(void) if (!tsc_resp) goto e_free_tsc_req; - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - goto e_free_tsc_resp; - - rio = kzalloc(sizeof(*rio), GFP_KERNEL); - if (!rio) - goto e_free_req; - mdesc = snp_msg_alloc(); if (IS_ERR_OR_NULL(mdesc)) - goto e_free_rio; + goto e_free_tsc_resp; rc = snp_msg_init(mdesc, snp_vmpl); if (rc) goto e_free_mdesc; - req->msg_version = MSG_HDR_VER; - req->msg_type = SNP_MSG_TSC_INFO_REQ; - req->vmpck_id = snp_vmpl; - req->req_buf = tsc_req; - req->req_sz = sizeof(*tsc_req); - req->resp_buf = (void *)tsc_resp; - req->resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; - req->exit_code = SVM_VMGEXIT_GUEST_REQUEST; + req.msg_version = MSG_HDR_VER; + req.msg_type = SNP_MSG_TSC_INFO_REQ; + req.vmpck_id = snp_vmpl; + req.req_buf = tsc_req; + req.req_sz = sizeof(*tsc_req); + req.resp_buf = (void *)tsc_resp; + req.resp_sz = sizeof(*tsc_resp) + AUTHTAG_LEN; + req.exit_code = SVM_VMGEXIT_GUEST_REQUEST; - rc = snp_send_guest_request(mdesc, req, rio); + rc = snp_send_guest_request(mdesc, &req); if (rc) goto e_request; @@ -2135,11 +2133,7 @@ static int __init snp_get_tsc_info(void) memzero_explicit(tsc_resp, sizeof(*tsc_resp) + AUTHTAG_LEN); e_free_mdesc: snp_msg_free(mdesc); -e_free_rio: - kfree(rio); -e_free_req: - kfree(req); - e_free_tsc_resp: +e_free_tsc_resp: kfree(tsc_resp); e_free_tsc_req: kfree(tsc_req); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 7cd2f395f301e..bd182325ebb46 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -27,10 +27,10 @@ CONFIG_CGROUP_DEBUG=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y +CONFIG_KEXEC=y CONFIG_SMP=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y -CONFIG_NR_CPUS=8 CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y @@ -39,9 +39,6 @@ CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_HZ_1000=y -CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y -# CONFIG_MITIGATION_RETHUNK is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y @@ -52,7 +49,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y -CONFIG_COMPAT_32BIT_TIME=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -63,9 +59,7 @@ CONFIG_BINFMT_MISC=y # CONFIG_COMPAT_BRK is not set CONFIG_NET=y CONFIG_PACKET=y -CONFIG_UNIX=y CONFIG_XFRM_USER=y -CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y @@ -134,7 +128,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_CAPSULE_LOADER=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y @@ -210,7 +203,6 @@ CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y CONFIG_HID_GYRATION=y -CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y CONFIG_HID_PANTHERLORD=y CONFIG_PANTHERLORD_FF=y @@ -241,7 +233,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y 
CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y @@ -266,19 +257,13 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y -CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_WX=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_UNWINDER_FRAME_POINTER=y CONFIG_DEBUG_ENTRY=y -# CONFIG_64BIT is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 61e25f6209ed4..7d7310cdf8b0a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -27,6 +27,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_BLK_DEV_INITRD=y CONFIG_KALLSYMS_ALL=y CONFIG_PROFILING=y +CONFIG_KEXEC=y CONFIG_SMP=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y @@ -40,8 +41,6 @@ CONFIG_EFI=y CONFIG_EFI_STUB=y CONFIG_EFI_MIXED=y CONFIG_HZ_1000=y -CONFIG_KEXEC=y -CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y @@ -63,9 +62,7 @@ CONFIG_BINFMT_MISC=y # CONFIG_COMPAT_BRK is not set CONFIG_NET=y CONFIG_PACKET=y -CONFIG_UNIX=y CONFIG_XFRM_USER=y -CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y CONFIG_IP_MULTIPLE_TABLES=y @@ -205,7 +202,6 @@ CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y CONFIG_HID_GYRATION=y -CONFIG_LOGITECH_FF=y CONFIG_HID_NTRIG=y CONFIG_HID_PANTHERLORD=y CONFIG_PANTHERLORD_FF=y @@ -239,7 +235,6 @@ CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y CONFIG_QUOTA=y CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set CONFIG_QFMT_V2=y CONFIG_AUTOFS_FS=y CONFIG_ISO9660_FS=y @@ -264,13 +259,11 @@ CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_WX=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30de..9fd1291e7bdfb 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 333 common io_pgetevents sys_io_pgetevents 334 common rseq sys_rseq 335 common uretprobe sys_uretprobe +336 common uprobe sys_uprobe # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal sys_pidfd_send_signal diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f0e9acf725471..8593976b32cbb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -39,7 +39,7 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUG_FLAGS(ins, flags, extra) \ +#define _BUG_FLAGS(cond_str, ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ @@ -50,14 +50,14 @@ do { \ "\t.org 2b+%c3\n" \ ".popsection\n" \ extra \ - : : "i" (__FILE__), "i" (__LINE__), \ + : : "i" (WARN_CONDITION_STR(cond_str) __FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) #else /* !CONFIG_DEBUG_BUGVERBOSE */ -#define _BUG_FLAGS(ins, flags, extra) \ +#define _BUG_FLAGS(cond_str, ins, flags, extra) \ do 
{ \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ @@ -74,7 +74,7 @@ do { \ #else -#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) +#define _BUG_FLAGS(cond_str, ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -82,7 +82,7 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0, ""); \ + _BUG_FLAGS("", ASM_UD2, 0, ""); \ __builtin_unreachable(); \ } while (0) @@ -92,11 +92,11 @@ do { \ * were to trigger, we'd rather wreck the machine in an attempt to get the * message out than not know about it. */ -#define __WARN_FLAGS(flags) \ +#define __WARN_FLAGS(cond_str, flags) \ do { \ __auto_type __flags = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \ + _BUG_FLAGS(cond_str, ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \ instrumentation_end(); \ } while (0) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ee176236c2be9..b78af55aa22e2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -457,9 +457,12 @@ #define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ + #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_GP_ON_USER_CPUID (20*32+17) /* User CPUID faulting */ + #define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 363110e6b2e33..a2c1f2d24b645 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -9,6 +9,14 @@ #include #include +/* + * Define bits that are always set to 1 in DR7, only bit 10 is + * architecturally reserved to '1'. + * + * This is also the init/reset value for DR7. 
+ */ +#define DR7_FIXED_1 0x00000400 + DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL @@ -100,8 +108,8 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) static inline void hw_breakpoint_disable(void) { - /* Zero the control register for HW Breakpoint */ - set_debugreg(0UL, 7); + /* Reset the control register for HW Breakpoint */ + set_debugreg(DR7_FIXED_1, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); @@ -125,9 +133,12 @@ static __always_inline unsigned long local_db_save(void) return 0; get_debugreg(dr7, 7); - dr7 &= ~0x400; /* architecturally set bit */ + + /* Architecturally set bit */ + dr7 &= ~DR7_FIXED_1; if (dr7) - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); + /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 1c94121acd3da..93e99d2583d6f 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -118,7 +118,7 @@ enum xfeature { XFEATURE_PKRU, XFEATURE_PASID, XFEATURE_CET_USER, - XFEATURE_CET_KERNEL_UNUSED, + XFEATURE_CET_KERNEL, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, @@ -142,7 +142,7 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) -#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) @@ -268,6 +268,16 @@ struct cet_user_state { u64 user_ssp; }; +/* + * State component 12 is Control-flow Enforcement supervisor states. + * This state includes SSP pointers for privilege levels 0 through 2. + */ +struct cet_supervisor_state { + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; +} __packed; + /* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). @@ -551,6 +561,31 @@ struct fpu_guest { struct fpstate *fpstate; }; +/* + * FPU state configuration data for fpu_guest. + * Initialized at boot time. Read only after init. + */ +struct vcpu_fpu_config { + /* + * @size: + * + * The default size of the register state buffer in guest FPUs. + * Includes all supported features except independent managed + * features and features which have to be requested by user space + * before usage. + */ + unsigned int size; + + /* + * @features: + * + * The default supported features bitmap in guest FPUs. Does not + * include independent managed features and features which have to + * be requested by user space before usage. + */ + u64 features; +}; + /* * FPU state configuration data. Initialized at boot time. Read only after init. */ @@ -567,8 +602,9 @@ struct fpu_state_config { * @default_size: * * The default size of the register state buffer. Includes all - * supported features except independent managed features and - * features which have to be requested by user space before usage. + * supported features except independent managed features, + * guest-only features and features which have to be requested by + * user space before usage. 
*/ unsigned int default_size; @@ -584,8 +620,8 @@ struct fpu_state_config { * @default_features: * * The default supported features bitmap. Does not include - * independent managed features and features which have to - * be requested by user space before usage. + * independent managed features, guest-only features and features + * which have to be requested by user space before usage. */ u64 default_features; /* @@ -606,5 +642,6 @@ struct fpu_state_config { /* FPU state configuration information */ extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; +extern struct vcpu_fpu_config guest_default_cfg; #endif /* _ASM_X86_FPU_TYPES_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b308a76afbb7c..7a7dc9d560275 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -46,9 +46,13 @@ /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA +/* Supervisor features which are enabled only in guest FPUs */ +#define XFEATURE_MASK_GUEST_SUPERVISOR XFEATURE_MASK_CET_KERNEL + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ - XFEATURE_MASK_CET_USER) + XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_GUEST_SUPERVISOR) /* * A supervisor state component may not always contain valuable information, @@ -75,8 +79,7 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ - XFEATURE_MASK_CET_KERNEL) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) /* All supervisor states including supported and unsupported states. 
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b4a391929cdba..639d9bcee8424 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,6 @@ enum x86_intercept_stage; #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) -#define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff #define KVM_GUESTDBG_VALID_MASK \ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b7dded3c81132..ff7e9743250fd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -830,6 +830,7 @@ #define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_HWCR_IRPERF_EN_BIT 30 #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) +#define MSR_K7_HWCR_CPUID_USER_DIS_BIT 35 #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 #define MSR_K7_HWCR_CPB_DIS_BIT 25 diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 58e028d42e41d..fbb616fcbfb8d 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -231,6 +231,7 @@ struct snp_guest_req { size_t resp_sz; u64 exit_code; + u64 exitinfo2; unsigned int vmpck_id; u8 msg_version; u8 msg_type; @@ -486,8 +487,6 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } -struct snp_guest_request_ioctl; - void setup_ghcb(void); void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages); @@ -513,8 +512,7 @@ void snp_kexec_begin(void); int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id); struct snp_msg_desc *snp_msg_alloc(void); void snp_msg_free(struct snp_msg_desc *mdesc); -int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio); +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req); int snp_svsm_vtpm_send_command(u8 *buffer); @@ -587,8 +585,8 @@ static inline void snp_kexec_begin(void) { } static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; } static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } -static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, - struct snp_guest_request_ioctl *rio) { return -ENODEV; } +static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, + struct snp_guest_req *req) { return -ENODEV; } static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 678fb546f0a75..1ee2e5115955c 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +enum { + ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0, + ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1, +}; + struct uprobe_xol_ops; struct arch_uprobe { @@ -45,6 +50,8 @@ struct arch_uprobe { u8 ilen; } push; }; + + unsigned long flags; }; struct arch_uprobe_task { diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index 
0007ba077c0c2..41da492dfb01f 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -15,7 +15,26 @@ which debugging register was responsible for the trap. The other bits are either reserved or not of interest to us. */ -/* Define reserved bits in DR6 which are always set to 1 */ +/* + * Define bits in DR6 which are set to 1 by default. + * + * This is also the DR6 architectural value following Power-up, Reset or INIT. + * + * Note, with the introduction of Bus Lock Detection (BLD) and Restricted + * Transactional Memory (RTM), the DR6 register has been modified: + * + * 1) BLD flag (bit 11) is no longer reserved to 1 if the CPU supports + * Bus Lock Detection. The assertion of a bus lock could clear it. + * + * 2) RTM flag (bit 16) is no longer reserved to 1 if the CPU supports + * restricted transactional memory. #DB occurred inside an RTM region + * could clear it. + * + * Apparently, DR6.BLD and DR6.RTM are active low bits. + * + * As a result, DR6_RESERVED is an incorrect name now, but it is kept for + * compatibility. + */ #define DR6_RESERVED (0xFFFF0FF0) #define DR_TRAP0 (0x1) /* db0 */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b2ad8d13211ac..f18f540db58ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -489,6 +489,10 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } bsp_determine_snp(c); + + if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID)) + setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT); + return; warn: diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 7f94e6a5497d9..bdef2c9aa1b8b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -113,10 +113,9 @@ void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; static void __init set_return_thunk(void *thunk) { - if (x86_return_thunk != __x86_return_thunk) - pr_warn("x86/bugs: return thunk changed\n"); - x86_return_thunk = thunk; + + pr_info("active return thunk: %ps\n", thunk); } /* Update SPEC_CTRL MSR and its cached copy unconditionally */ @@ -1120,6 +1119,20 @@ early_param("nospectre_v1", nospectre_v1_cmdline); enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +/* Depends on spectre_v2 mitigation selected already */ +static inline bool cdt_possible(enum spectre_v2_mitigation mode) +{ + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) || + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) + return false; + + if (mode == SPECTRE_V2_RETPOLINE || + mode == SPECTRE_V2_EIBRS_RETPOLINE) + return true; + + return false; +} + #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt @@ -1247,6 +1260,14 @@ static void __init retbleed_select_mitigation(void) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; else retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + /* Final mitigation depends on spectre-v2 selection */ + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + else if (boot_cpu_has(X86_FEATURE_IBRS)) + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } } @@ -1255,27 +1276,16 @@ static void __init retbleed_update_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) return; - if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) - goto out; - - /* - * retbleed=stuff is only allowed on Intel. If stuffing can't be used - * then a different mitigation will be selected below. 
- * - * its=stuff will also attempt to enable stuffing. - */ - if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || - its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { - if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - retbleed_mitigation = RETBLEED_MITIGATION_AUTO; - } else { - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_info("Retbleed mitigation updated to stuffing\n"); + /* ITS can also enable stuffing */ + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - } + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF && + !cdt_possible(spectre_v2_enabled)) { + pr_err("WARNING: retbleed=stuff depends on retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } + /* * Let IBRS trump all on Intel without affecting the effects of the * retbleed= cmdline option except for call depth based stuffing @@ -1294,15 +1304,11 @@ static void __init retbleed_update_mitigation(void) if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) pr_err(RETBLEED_INTEL_MSG); } - /* If nothing has set the mitigation yet, default to NONE. */ - if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) - retbleed_mitigation = RETBLEED_MITIGATION_NONE; } -out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } - static void __init retbleed_apply_mitigation(void) { bool mitigate_smt = false; @@ -1449,6 +1455,7 @@ static void __init its_update_mitigation(void) its_mitigation = ITS_MITIGATION_OFF; break; case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: /* Retpoline+CDT mitigates ITS */ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; @@ -1462,13 +1469,8 @@ static void __init its_update_mitigation(void) break; } - /* - * retbleed_update_mitigation() will try to do stuffing if its=stuff. - * If it can't, such as if spectre_v2!=retpoline, then fall back to - * aligned thunks. - */ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && - retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + !cdt_possible(spectre_v2_enabled)) its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; pr_info("%s\n", its_strings[its_mitigation]); @@ -1476,15 +1478,24 @@ static void __init its_update_mitigation(void) static void __init its_apply_mitigation(void) { - /* its=stuff forces retbleed stuffing and is enabled there. 
*/ - if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) - return; - - if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) - setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + switch (its_mitigation) { + case ITS_MITIGATION_OFF: + case ITS_MITIGATION_AUTO: + case ITS_MITIGATION_VMEXIT_ONLY: + break; + case ITS_MITIGATION_ALIGNED_THUNKS: + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - set_return_thunk(its_return_thunk); + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + case ITS_MITIGATION_RETPOLINE_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + set_return_thunk(call_depth_return_thunk); + break; + } } #undef pr_fmt diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8feb8fd2957ad..325f892ee42de 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -2243,20 +2244,16 @@ EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) +static void initialize_debug_regs(void) { - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } + /* Control register first -- to make sure everything is disabled. */ + set_debugreg(DR7_FIXED_1, 7); + set_debugreg(DR6_RESERVED, 6); + /* dr5 and dr4 don't exist */ + set_debugreg(0, 3); + set_debugreg(0, 2); + set_debugreg(0, 1); + set_debugreg(0, 0); } #ifdef CONFIG_KGDB @@ -2417,7 +2414,7 @@ void cpu_init(void) load_mm_ldt(&init_mm); - clear_all_debug_regs(); + initialize_debug_regs(); dbg_restore_debug_regs(); doublefault_init_cpu_tss(); @@ -2529,6 +2526,12 @@ void __init arch_cpu_finalize_init(void) fpu__init_system(); fpu__init_cpu(); + /* + * This needs to follow the FPU initialization, since EFI depends on it. + */ + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); + /* * Ensure that access to the per CPU representation has the initial * boot CPU configuration. diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index bcb534688dfe7..c6b12bed173df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -163,10 +163,10 @@ static struct crash_mem *fill_up_crash_elf_data(void) return NULL; /* - * Exclusion of crash region and/or crashk_low_res may cause - * another range split. So add extra two slots here. 
*/ - nr_ranges += 2; + nr_ranges += 2 + crashk_cma_cnt; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; @@ -184,6 +184,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + int i; /* Exclude the low 1M because it is always reserved */ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1); @@ -198,8 +199,17 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; - return ret; + for (i = 0; i < crashk_cma_cnt; ++i) { + ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + if (ret) + return ret; + } + + return 0; } static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) @@ -374,6 +384,14 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) add_e820_entry(params, &ei); } + for (i = 0; i < crashk_cma_cnt; ++i) { + ei.addr = crashk_cma_ranges[i].start; + ei.size = crashk_cma_ranges[i].end - + crashk_cma_ranges[i].start + 1; + ei.type = E820_TYPE_RAM; + add_e820_entry(params, &ei); + } + out: vfree(cmem); return ret; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ea138583dd92a..aefd412a23dc2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -37,6 +37,7 @@ DEFINE_PER_CPU(u64, xfd_state); /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; +struct vcpu_fpu_config guest_default_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, @@ -217,7 +218,7 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) -static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); +static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) { @@ -242,19 +243,21 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) struct fpstate *fpstate; unsigned int size; - size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + size = guest_default_cfg.size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); if (!fpstate) return false; - /* Leave xfd to 0 (the reset value defined by spec) */ - __fpstate_reset(fpstate, 0); - fpstate_init_user(fpstate); + /* Initialize indicators to reflect properties of the fpstate */ fpstate->is_valloc = true; fpstate->is_guest = true; + __fpstate_reset(fpstate); + fpstate_init_user(fpstate); + gfpu->fpstate = fpstate; - gfpu->xfeatures = fpu_kernel_cfg.default_features; + gfpu->xfeatures = guest_default_cfg.features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -541,28 +544,50 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } -static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +static void __fpstate_reset(struct fpstate *fpstate) { - /* Initialize sizes and feature masks */ - fpstate->size = fpu_kernel_cfg.default_size; + /* + * Supervisor features (and thus sizes) may diverge between guest + * FPUs and host FPUs, as some supervisor features are supported + * for guests despite not being utilized by the host. User + * features and sizes are always identical, which allows for + * common guest and userspace ABI. 
+ * + * For the host, set XFD to the kernel's desired initialization + * value. For guests, set XFD to its architectural RESET value. + */ + if (fpstate->is_guest) { + fpstate->size = guest_default_cfg.size; + fpstate->xfeatures = guest_default_cfg.features; + fpstate->xfd = 0; + } else { + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->xfd = init_fpstate.xfd; + } + fpstate->user_size = fpu_user_cfg.default_size; - fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; - fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + __fpstate_reset(fpu->fpstate); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; - /* Same defaults for guests */ - fpu->guest_perm = fpu->perm; + + fpu->guest_perm.__state_perm = guest_default_cfg.features; + fpu->guest_perm.__state_size = guest_default_cfg.size; + /* + * User features and sizes are always identical between host and + * guest FPUs, which allows for common guest and userspace ABI. + */ + fpu->guest_perm.__user_state_size = fpu_user_cfg.default_size; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 99db41bf9fa6b..ff988b9ea39f0 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -205,6 +205,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; + guest_default_cfg.size = size; } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9aa9ac8399aee..12ed75c1b5675 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -57,7 +57,7 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "Control-flow User registers", - "Control-flow Kernel registers (unused)", + "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", @@ -81,6 +81,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, + [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, @@ -372,6 +373,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ + XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) @@ -573,6 +575,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: @@ -743,6 +746,9 @@ static int __init 
init_xstate_size(void) fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); + guest_default_cfg.size = + xstate_calculate_size(guest_default_cfg.features, compacted); + return 0; } @@ -763,6 +769,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; + guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the @@ -773,6 +780,24 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) fpstate_reset(x86_task_fpu(current)); } +static u64 __init host_default_mask(void) +{ + /* + * Exclude dynamic features (require userspace opt-in) and features + * that are supported only for KVM guests. + */ + return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); +} + +static u64 __init guest_default_mask(void) +{ + /* + * Exclude dynamic features, which require userspace opt-in even + * for KVM guests. + */ + return ~(u64)XFEATURE_MASK_USER_DYNAMIC; +} + /* * Enable and initialize the xsave feature. * Called once per system bootup. @@ -855,12 +880,13 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; - /* Clean out dynamic features from default */ - fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; - fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; - - fpu_user_cfg.default_features = fpu_user_cfg.max_features; - fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + /* + * Now, given maximum feature set, determine default values by + * applying default masks. 
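+ * Guest defaults are derived from the kernel's maximum feature set, so + * guest-only supervisor states are kept for KVM while being masked out + * of the host defaults.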
+ */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); + fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); + guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 102641fd21728..8b1a9733d13e3 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) struct perf_event *bp; /* Disable hardware debugging while we are in kgdb: */ - set_debugreg(0UL, 7); + set_debugreg(DR7_FIXED_1, 7); for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 704883c21f3a1..7b94851bb37eb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -334,13 +334,21 @@ DEFINE_PER_CPU(u64, msr_misc_features_shadow); static void set_cpuid_faulting(bool on) { - u64 msrval; - msrval = this_cpu_read(msr_misc_features_shadow); - msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; - msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); - this_cpu_write(msr_misc_features_shadow, msrval); - wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + u64 msrval; + + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (on) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT); + else + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_CPUID_USER_DIS_BIT); + } } static void disable_cpuid(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a10e180cbf233..3ef15c2f152f7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1)) return; printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8d6cf25127aab..b972bf72fb8b6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -133,7 +133,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. 
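* DR7's architectural reset value is DR7_FIXED_1 (0x400, bit 10 reserved as 1), not 0.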
*/ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400))) { + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index fb27be6971286..680d1b6dfea41 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -599,7 +599,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) static void __init arch_reserve_crashkernel(void) { - unsigned long long crash_base, crash_size, low_size = 0; + unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0; bool high = false; int ret; @@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, - &low_size, &high); + &low_size, &cma_size, &high); if (ret) return; @@ -618,6 +618,7 @@ static void __init arch_reserve_crashkernel(void) } reserve_crashkernel_generic(crash_size, crash_base, low_size, high); + reserve_crashkernel_cma(cma_size); } static struct resource standard_io_resources[] = { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c5c897a864180..36354b4705905 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1022,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. + * + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. * - * Keep it simple: clear DR6 immediately. + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. + * + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -1239,13 +1247,13 @@ static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED @@ -1264,7 +1272,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which - * debug_read_clear_dr6() returns for the IDT entry points. + * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); @@ -1279,7 +1287,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) /* 32 bit does not have separate entry points. 
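* The single exc_debug entry below reads DR6 once and dispatches to the user or kernel handler based on user_mode(regs).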
*/ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6d383839e8393..bea2d53804528 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -18,6 +18,7 @@ #include #include #include +#include /* Post-execution fixups. */ @@ -338,7 +339,7 @@ extern u8 uretprobe_trampoline_entry[]; extern u8 uretprobe_trampoline_end[]; extern u8 uretprobe_syscall_check[]; -void *arch_uprobe_trampoline(unsigned long *psize) +void *arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; struct pt_regs *regs = task_pt_regs(current); @@ -608,6 +609,607 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) *sr = utask->autask.saved_scratch_register; } } + +static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + +static struct page *tramp_mapping_pages[2] __ro_after_init; + +static struct vm_special_mapping tramp_mapping = { + .name = "[uprobes-trampoline]", + .mremap = tramp_mremap, + .pages = tramp_mapping_pages, +}; + +struct uprobe_trampoline { + struct hlist_node node; + unsigned long vaddr; +}; + +static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr) +{ + long delta = (long)(vaddr + 5 - vtramp); + + return delta >= INT_MIN && delta <= INT_MAX; +} + +#define __4GB (1UL << 32) +#define MASK_4GB (~(__4GB - 1)) +#define PAGE_COUNT(addr) ((addr & ~MASK_4GB) >> PAGE_SHIFT) + +static unsigned long find_nearest_page(unsigned long vaddr) +{ + struct vm_unmapped_area_info info = { + .length = PAGE_SIZE, + .align_mask = ~PAGE_MASK, + }; + unsigned long limit, low_limit = PAGE_SIZE, high_limit = TASK_SIZE; + unsigned long cross_4GB, low_4GB, high_4GB; + unsigned long low_tramp, high_tramp; + unsigned long call_end = vaddr + 5; + + /* + * The idea is to create a trampoline every 4GB, so we need to find + * the free page closest to the 4GB alignment. We find the intersecting + * 4GB alignment address and search up and down from it for the + * closest free page. + */ + + low_4GB = call_end & MASK_4GB; + high_4GB = low_4GB + __4GB; + + /* Restrict limits to be within [PAGE_SIZE,TASK_SIZE] boundary. */ + if (!check_add_overflow(call_end, INT_MIN, &limit)) + low_limit = limit; + if (low_limit == PAGE_SIZE) + low_4GB = low_limit; + + high_limit = call_end + INT_MAX; + if (high_limit > TASK_SIZE) + high_limit = high_4GB = TASK_SIZE; + + /* Get 4GB aligned address that's within (call_end - 2GB, call_end + 2GB). */ + if (low_limit <= low_4GB) + cross_4GB = low_4GB; + else + cross_4GB = high_4GB; + + /* Search up from intersecting 4GB alignment address. */ + info.low_limit = cross_4GB; + info.high_limit = high_limit; + high_tramp = vm_unmapped_area(&info); + + /* Search down from intersecting 4GB alignment address. */ + info.low_limit = low_limit; + info.high_limit = cross_4GB; + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + low_tramp = vm_unmapped_area(&info); + + if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp)) + return -ENOMEM; + if (IS_ERR_VALUE(high_tramp)) + return low_tramp; + if (IS_ERR_VALUE(low_tramp)) + return high_tramp; + + /* Return address that's closest to the 4GB alignment address.
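+ * E.g. with cross_4GB at 0x200000000 and free pages found at 0x1fffff000 + * (4K below) and 0x200010000 (64K above), the lower page wins.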
*/ + if (cross_4GB - low_tramp < high_tramp - cross_4GB) + return low_tramp; + return high_tramp; +} + +static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr) +{ + struct pt_regs *regs = task_pt_regs(current); + struct mm_struct *mm = current->mm; + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + + if (!user_64bit_mode(regs)) + return NULL; + + vaddr = find_nearest_page(vaddr); + if (IS_ERR_VALUE(vaddr)) + return NULL; + + tramp = kzalloc(sizeof(*tramp), GFP_KERNEL); + if (unlikely(!tramp)) + return NULL; + + tramp->vaddr = vaddr; + vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE, + VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO, + &tramp_mapping); + if (IS_ERR(vma)) + goto free_area; + return tramp; + +free_area: + kfree(tramp); + return NULL; +} + +static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new) +{ + struct uprobes_state *state = &current->mm->uprobes_state; + struct uprobe_trampoline *tramp = NULL; + + if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE) + return NULL; + + hlist_for_each_entry(tramp, &state->head_tramps, node) { + if (is_reachable_by_call(tramp->vaddr, vaddr)) + return tramp; + } + + tramp = create_uprobe_trampoline(vaddr); + if (!tramp) + return NULL; + + *new = true; + hlist_add_head(&tramp->node, &state->head_tramps); + return tramp; +} + +static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp) +{ + hlist_del(&tramp->node); + kfree(tramp); +} + +void arch_uprobe_init_state(struct mm_struct *mm) +{ + INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps); +} + +void arch_uprobe_clear_state(struct mm_struct *mm) +{ + struct uprobes_state *state = &mm->uprobes_state; + struct uprobe_trampoline *tramp; + struct hlist_node *n; + + hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) + destroy_uprobe_trampoline(tramp); +} + +static bool __in_uprobe_trampoline(unsigned long ip) +{ + struct vm_area_struct *vma = vma_lookup(current->mm, ip); + + return vma && vma_is_special_mapping(vma, &tramp_mapping); +} + +static bool in_uprobe_trampoline(unsigned long ip) +{ + struct mm_struct *mm = current->mm; + bool found, retry = true; + unsigned int seq; + + rcu_read_lock(); + if (mmap_lock_speculate_try_begin(mm, &seq)) { + found = __in_uprobe_trampoline(ip); + retry = mmap_lock_speculate_retry(mm, seq); + } + rcu_read_unlock(); + + if (retry) { + mmap_read_lock(mm); + found = __in_uprobe_trampoline(ip); + mmap_read_unlock(mm); + } + return found; +} + +SYSCALL_DEFINE0(uprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + unsigned long ip, sp, ax_r11_cx_ip[4]; + int err; + + /* Allow execution only from uprobe trampolines.
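+ * Any other caller is sent a SIGILL.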
*/ + if (!in_uprobe_trampoline(regs->ip)) + goto sigill; + + err = copy_from_user(ax_r11_cx_ip, (void __user *)regs->sp, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + ip = regs->ip; + + /* + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: + * - adjust ip to the probe address (the trampoline call saved the address + * of the next instruction, 5 bytes past the probe) + * - adjust sp to the probe's stack frame (check trampoline code) + */ + regs->ax = ax_r11_cx_ip[0]; + regs->r11 = ax_r11_cx_ip[1]; + regs->cx = ax_r11_cx_ip[2]; + regs->ip = ax_r11_cx_ip[3] - 5; + regs->sp += sizeof(ax_r11_cx_ip); + regs->orig_ax = -1; + + sp = regs->sp; + + handle_syscall_uprobe(regs, regs->ip); + + /* + * If some of the uprobe consumers changed sp, there is nothing we + * can do, just return via iret. + */ + if (regs->sp != sp) + return regs->ax; + + regs->sp -= sizeof(ax_r11_cx_ip); + + /* in case a uprobe_consumer changed ax/r11/cx */ + ax_r11_cx_ip[0] = regs->ax; + ax_r11_cx_ip[1] = regs->r11; + ax_r11_cx_ip[2] = regs->cx; + + /* keep the return address unless we are instructed otherwise */ + if (ax_r11_cx_ip[3] - 5 != regs->ip) + ax_r11_cx_ip[3] = regs->ip; + + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, ax_r11_cx_ip, sizeof(ax_r11_cx_ip)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + return 0; + +sigill: + force_sig(SIGILL); + return -1; +} + +asm ( + ".pushsection .rodata\n" + ".balign " __stringify(PAGE_SIZE) "\n" + "uprobe_trampoline_entry:\n" + "push %rcx\n" + "push %r11\n" + "push %rax\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "pop %rax\n" + "pop %r11\n" + "pop %rcx\n" + "ret\n" + ".balign " __stringify(PAGE_SIZE) "\n" + ".popsection\n" +); + +extern u8 uprobe_trampoline_entry[]; + +static ssize_t +bpf_uprobe_trampoline_write(struct file *file, struct kobject *kobj, + const struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + struct mm_struct *mm = current->mm; + unsigned long vaddr = 0; + bool new = false; + + if (kstrtoul(buf, 0, &vaddr)) + return -EINVAL; + + mmap_write_lock(mm); + get_uprobe_trampoline(vaddr, &new); + mmap_write_unlock(mm); + + return strlen(buf); +} + +static struct bin_attribute bin_attr_uprobe_trampoline_file __ro_after_init = { + .attr = { .name = "uprobe_trampoline", .mode = 0666, }, + .write = bpf_uprobe_trampoline_write, +}; + +static int __init arch_uprobes_init(void) +{ + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); + return sysfs_create_bin_file(kernel_kobj, &bin_attr_uprobe_trampoline_file); +} + +late_initcall(arch_uprobes_init); + +enum { + OPT_PART, + OPT_INSN, + UNOPT_INT3, + UNOPT_PART, +}; + +struct write_opcode_ctx { + unsigned long base; + int update; +}; + +static int is_call_insn(uprobe_opcode_t *insn) +{ + return *insn == CALL_INSN_OPCODE; +} + +static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode, + int nbytes, void *data) +{ + struct write_opcode_ctx *ctx = data; + uprobe_opcode_t old_opcode[5]; + + uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5); + + switch (ctx->update) { + case OPT_PART: + case OPT_INSN: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_INT3: + if (is_call_insn(&old_opcode[0])) + return 1; + break; + case UNOPT_PART: + if (is_swbp_insn(&old_opcode[0])) + return 1; + break; + } + + return -1; +} + +static int
write_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *ctx) +{ + return uprobe_write(auprobe, vma, vaddr, insn, nbytes, verify_insn, + true /* is_register */, false /* do_update_ref_ctr */, ctx); +} + +static void relative_call(void *dest, long from, long to) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *insn; + + insn = (struct __arch_relative_insn *)dest; + insn->raddr = (s32)(to - (from + 5)); + insn->op = CALL_INSN_OPCODE; +} + +static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr, unsigned long tramp) +{ + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + char call[5]; + int err; + + relative_call(call, vaddr, tramp); + + /* + * We are in a state where a breakpoint (int3) is installed on top of the + * first byte of the nop5 instruction. We will do the following steps to + * overwrite it with a call instruction: + * + * - sync cores + * - write the last 4 bytes of the call instruction + * - sync cores + * - update the call instruction opcode + */ + + smp_text_poke_sync_each_cpu(); + + ctx.update = OPT_PART; + err = write_insn(auprobe, vma, vaddr + 1, call + 1, 4, &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + ctx.update = OPT_INSN; + return write_insn(auprobe, vma, vaddr, call, 1, &ctx); +} + +static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + uprobe_opcode_t int3 = UPROBE_SWBP_INSN; + struct write_opcode_ctx ctx = { + .base = vaddr, + }; + int err; + + /* + * We need to overwrite the call instruction back into a nop5 instruction + * with a breakpoint (int3) installed on top of its first byte. We will: + * + * - overwrite the call opcode with a breakpoint (int3) + * - sync cores + * - write the last 4 bytes of the nop5 instruction + * - sync cores + */ + + ctx.update = UNOPT_INT3; + err = write_insn(auprobe, vma, vaddr, &int3, 1, &ctx); + if (err) + return err; + + smp_text_poke_sync_each_cpu(); + + ctx.update = UNOPT_PART; + err = write_insn(auprobe, vma, vaddr + 1, (uprobe_opcode_t *) auprobe->insn + 1, 4, &ctx); + + smp_text_poke_sync_each_cpu(); + return err; +} + +static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len) +{ + unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD; + struct vm_area_struct *vma; + struct page *page; + + page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); + if (IS_ERR(page)) + return PTR_ERR(page); + uprobe_copy_from_page(page, vaddr, dst, len); + put_page(page); + return 0; +} + +static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr) +{ + struct __packed __arch_relative_insn { + u8 op; + s32 raddr; + } *call = (struct __arch_relative_insn *) insn; + + if (!is_call_insn(insn)) + return false; + return __in_uprobe_trampoline(vaddr + 5 + call->raddr); +} + +static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized) +{ + uprobe_opcode_t insn[5]; + int err; + + err = copy_from_vaddr(mm, vaddr, &insn, 5); + if (err) + return err; + *optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr); + return 0; +} + +static bool should_optimize(struct arch_uprobe *auprobe) +{ + return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) && + test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); +} + +int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (should_optimize(auprobe)) { + bool optimized = false; + int err; + + /* + *
We could race with another thread that already optimized the probe, + * so let's not overwrite it with int3 again in this case. + */ + err = is_optimized(vma->vm_mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + return 0; + } + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, + true /* is_register */); +} + +int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) +{ + if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) { + struct mm_struct *mm = vma->vm_mm; + bool optimized = false; + int err; + + err = is_optimized(mm, vaddr, &optimized); + if (err) + return err; + if (optimized) + WARN_ON_ONCE(swbp_unoptimize(auprobe, vma, vaddr)); + } + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn, + false /* is_register */); +} + +static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long vaddr) +{ + struct uprobe_trampoline *tramp; + struct vm_area_struct *vma; + bool new = false; + int err = 0; + + vma = find_vma(mm, vaddr); + if (!vma) + return -EINVAL; + tramp = get_uprobe_trampoline(vaddr, &new); + if (!tramp) + return -EINVAL; + err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr); + if (WARN_ON_ONCE(err) && new) + destroy_uprobe_trampoline(tramp); + return err; +} + +void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + struct mm_struct *mm = current->mm; + uprobe_opcode_t insn[5]; + + /* + * Do not optimize if shadow stack is enabled: the return address hijack + * code in arch_uretprobe_hijack_return_addr() updates the wrong frame + * when the entry uprobe is optimized, and the shadow stack then crashes + * the app. + */ + if (shstk_is_enabled()) + return; + + if (!should_optimize(auprobe)) + return; + + mmap_write_lock(mm); + + /* + * Check if some other thread already optimized the uprobe for us; + * if that's the case, just go away silently. + */ + if (copy_from_vaddr(mm, vaddr, &insn, 5)) + goto unlock; + if (!is_swbp_insn((uprobe_opcode_t *) &insn)) + goto unlock; + + /* + * If we fail to optimize the uprobe, set the fail bit so the + * should_optimize() check above fails from now on. + */ + if (__arch_uprobe_optimize(auprobe, mm, vaddr)) + set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags); + +unlock: + mmap_write_unlock(mm); +} + +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + if (memcmp(&auprobe->insn, x86_nops[5], 5)) + return false; + /* We can't do cross page atomic writes yet.
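+ * The nop5 must therefore fit within a single page: vaddr and vaddr + 4 + * have to land in the same page, which is what the check below ensures.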
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -621,6 +1223,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -987,6 +1593,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b58a74c1722de..a9d992d5652fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11035,7 +11035,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.switch_db_regs && !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); @@ -11044,7 +11044,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); } vcpu->arch.host_debugctl = get_debugctlmsr(); diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 08cd913cbd4e9..8bf15c4aefa9e 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) pr_warn("Decoded and checked %d instructions with %d " "failures\n", insns, warnings); else - fprintf(stdout, "%s: success: Decoded and checked %d" + fprintf(stdout, " %s: success: Decoded and checked %d" " instructions\n", prog, insns); return 0; } diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 213f35f94feb1..e743f0ea01ee4 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -253,9 +253,9 @@ int main(int argc, char **argv) } fprintf((errors) ? stderr : stdout, - "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + " %s: %s: Decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, - (errors) ? "Failure" : "Success", + (errors) ? "failure" : "success", insns, (input_file) ? "given" : "random", errors, diff --git a/ci/diffs/.keep b/ci/diffs/.keep new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch new file mode 100644 index 0000000000000..3b6139225e7bf --- /dev/null +++ b/ci/diffs/0001-Revert-bpf-Avoid-unnecessary-audit-log-for-CPU-secur.patch @@ -0,0 +1,33 @@ +From 5440a12ac8fb2a8e051c597fcf5d85b427fe612a Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Fri, 13 Oct 2023 12:44:34 -0700 +Subject: [PATCH] Revert "bpf: Avoid unnecessary audit log for CPU security + mitigations" + +This reverts commit 236334aeec0f93217cf9235f2004e61a0a1a5985. 
+--- + include/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index f0891ba24cb1..61bde4520f5c 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -2164,12 +2164,12 @@ static inline bool bpf_allow_uninit_stack(void) + + static inline bool bpf_bypass_spec_v1(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + static inline bool bpf_bypass_spec_v4(void) + { +- return cpu_mitigations_off() || perfmon_capable(); ++ return perfmon_capable() || cpu_mitigations_off(); + } + + int bpf_map_new_fd(struct bpf_map *map, int flags); +-- +2.34.1 + diff --git a/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch new file mode 100644 index 0000000000000..63bdd28adedd2 --- /dev/null +++ b/ci/diffs/0001-arch-Kconfig-Move-SPECULATION_MITIGATIONS-to-arch-Kc.patch @@ -0,0 +1,69 @@ +From c71766e8ff7a7f950522d25896fba758585500df Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 22 Apr 2024 21:14:40 -0700 +Subject: [PATCH] arch/Kconfig: Move SPECULATION_MITIGATIONS to arch/Kconfig + +SPECULATION_MITIGATIONS is currently defined only for x86. As a result, +IS_ENABLED(CONFIG_SPECULATION_MITIGATIONS) is always false for other +archs. f337a6a21e2f effectively set "mitigations=off" by default on +non-x86 archs, which is not desired behavior. Jakub observed this +change when running bpf selftests on s390 and arm64. + +Fix this by moving SPECULATION_MITIGATIONS to arch/Kconfig so that it is +available in all archs and thus can be used safely in kernel/cpu.c + +Fixes: f337a6a21e2f ("x86/cpu: Actually turn off mitigations by default for SPECULATION_MITIGATIONS=n") +Cc: stable@vger.kernel.org +Cc: Sean Christopherson +Cc: Ingo Molnar +Cc: Daniel Sneddon +Cc: Jakub Kicinski +Signed-off-by: Song Liu +--- + arch/Kconfig | 10 ++++++++++ + arch/x86/Kconfig | 10 ---------- + 2 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 9f066785bb71..8f4af75005f8 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1609,4 +1609,14 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT + # strict alignment always, even with -falign-functions. + def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG + ++menuconfig SPECULATION_MITIGATIONS ++ bool "Mitigations for speculative execution vulnerabilities" ++ default y ++ help ++ Say Y here to enable options which enable mitigations for ++ speculative execution hardware vulnerabilities. ++ ++ If you say N, all mitigations will be disabled. You really ++ should know what you are doing to say so. ++ + endmenu +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 39886bab943a..50c890fce5e0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2486,16 +2486,6 @@ config PREFIX_SYMBOLS + def_bool y + depends on CALL_PADDING && !CFI_CLANG + +-menuconfig SPECULATION_MITIGATIONS +- bool "Mitigations for speculative execution vulnerabilities" +- default y +- help +- Say Y here to enable options which enable mitigations for +- speculative execution hardware vulnerabilities. +- +- If you say N, all mitigations will be disabled. You really +- should know what you are doing to say so. 
+- + if SPECULATION_MITIGATIONS + + config MITIGATION_PAGE_TABLE_ISOLATION +-- +2.43.0 + diff --git a/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch new file mode 100644 index 0000000000000..a13d767197413 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-few-selftest-failures-due-to-llvm18-change.patch @@ -0,0 +1,94 @@ +From fb9a697860acd8f54f2ba6647923794378eb33da Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Sun, 26 Nov 2023 21:03:42 -0800 +Subject: [PATCH] bpf: Fix a few selftest failures due to llvm18 change + +With latest upstream llvm18, the following test cases failed: + + $ ./test_progs -j + #13/2 bpf_cookie/multi_kprobe_link_api:FAIL + #13/3 bpf_cookie/multi_kprobe_attach_api:FAIL + #13 bpf_cookie:FAIL + #77 fentry_fexit:FAIL + #78/1 fentry_test/fentry:FAIL + #78 fentry_test:FAIL + #82/1 fexit_test/fexit:FAIL + #82 fexit_test:FAIL + #112/1 kprobe_multi_test/skel_api:FAIL + #112/2 kprobe_multi_test/link_api_addrs:FAIL + [...] + #112 kprobe_multi_test:FAIL + #356/17 test_global_funcs/global_func17:FAIL + #356 test_global_funcs:FAIL + +Further analysis shows llvm upstream patch [1] is responsible for the above +failures. For example, for function bpf_fentry_test7() in net/bpf/test_run.c, +without [1], the asm code is: + + 0000000000000400 : + 400: f3 0f 1e fa endbr64 + 404: e8 00 00 00 00 callq 0x409 + 409: 48 89 f8 movq %rdi, %rax + 40c: c3 retq + 40d: 0f 1f 00 nopl (%rax) + +... and with [1], the asm code is: + + 0000000000005d20 : + 5d20: e8 00 00 00 00 callq 0x5d25 + 5d25: c3 retq + +... and is called instead of +and this caused test failures for #13/#77 etc. except #356. + +For test case #356/17, with [1] (progs/test_global_func17.c)), the main prog +looks like: + + 0000000000000000 : + 0: b4 00 00 00 2a 00 00 00 w0 = 0x2a + 1: 95 00 00 00 00 00 00 00 exit + +... which passed verification while the test itself expects a verification +failure. + +Let us add 'barrier_var' style asm code in both places to prevent function +specialization which caused selftests failure. + + [1] https://github.com/llvm/llvm-project/pull/72903 + +Signed-off-by: Yonghong Song +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20231127050342.1945270-1-yonghong.song@linux.dev +--- + net/bpf/test_run.c | 2 +- + tools/testing/selftests/bpf/progs/test_global_func17.c | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c +index c9fdcc5cdce1..711cf5d59816 100644 +--- a/net/bpf/test_run.c ++++ b/net/bpf/test_run.c +@@ -542,7 +542,7 @@ struct bpf_fentry_test_t { + + int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) + { +- asm volatile (""); ++ asm volatile ("": "+r"(arg)); + return (long)arg; + } + +diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c +index a32e11c7d933..5de44b09e8ec 100644 +--- a/tools/testing/selftests/bpf/progs/test_global_func17.c ++++ b/tools/testing/selftests/bpf/progs/test_global_func17.c +@@ -5,6 +5,7 @@ + + __noinline int foo(int *p) + { ++ barrier_var(p); + return p ? 
(*p = 42) : 0; + } + +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch new file mode 100644 index 0000000000000..5832a42664706 --- /dev/null +++ b/ci/diffs/0001-bpf-Fix-a-verifier-bug-due-to-incorrect-branch-offse.patch @@ -0,0 +1,67 @@ +From dfce9cb3140592b886838e06f3e0c25fea2a9cae Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Thu, 30 Nov 2023 18:46:40 -0800 +Subject: [PATCH 1/1] bpf: Fix a verifier bug due to incorrect branch offset + comparison with cpu=v4 + +Bpf cpu=v4 support is introduced in [1] and Commit 4cd58e9af8b9 +("bpf: Support new 32bit offset jmp instruction") added support for new +32bit offset jmp instruction. Unfortunately, in function +bpf_adj_delta_to_off(), for new branch insn with 32bit offset, the offset +(plus/minor a small delta) compares to 16-bit offset bound +[S16_MIN, S16_MAX], which caused the following verification failure: + $ ./test_progs-cpuv4 -t verif_scale_pyperf180 + ... + insn 10 cannot be patched due to 16-bit range + ... + libbpf: failed to load object 'pyperf180.bpf.o' + scale_test:FAIL:expect_success unexpected error: -12 (errno 12) + #405 verif_scale_pyperf180:FAIL + +Note that due to recent llvm18 development, the patch [2] (already applied +in bpf-next) needs to be applied to bpf tree for testing purpose. + +The fix is rather simple. For 32bit offset branch insn, the adjusted +offset compares to [S32_MIN, S32_MAX] and then verification succeeded. + + [1] https://lore.kernel.org/all/20230728011143.3710005-1-yonghong.song@linux.dev + [2] https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev + +Fixes: 4cd58e9af8b9 ("bpf: Support new 32bit offset jmp instruction") +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/20231201024640.3417057-1-yonghong.song@linux.dev +--- + kernel/bpf/core.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index cd3afe57ece3..fe254ae035fe 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) + { +- const s32 off_min = S16_MIN, off_max = S16_MAX; ++ s64 off_min, off_max, off; + s32 delta = end_new - end_old; +- s32 off; + +- if (insn->code == (BPF_JMP32 | BPF_JA)) ++ if (insn->code == (BPF_JMP32 | BPF_JA)) { + off = insn->imm; +- else ++ off_min = S32_MIN; ++ off_max = S32_MAX; ++ } else { + off = insn->off; ++ off_min = S16_MIN; ++ off_max = S16_MAX; ++ } + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +-- +2.34.1 + diff --git a/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch new file mode 100644 index 0000000000000..ea6b2386d0345 --- /dev/null +++ b/ci/diffs/0001-bpf-next-selftests-bpf-Fix-a-btf_dump-selftest-failure.patch @@ -0,0 +1,40 @@ +From patchwork Fri Aug 2 18:54:34 2024 +From: Yonghong Song +Subject: [PATCH bpf-next] selftests/bpf: Fix a btf_dump selftest failure + +Jakub reported bpf selftest "btf_dump" failure after forwarding to +v6.11-rc1 with netdev. 
+ Error: #33 btf_dump + Error: #33/15 btf_dump/btf_dump: var_data + btf_dump_data:FAIL:find type id unexpected find type id: actual -2 < expected 0 + +The reason for the failure is due to + commit 94ede2a3e913 ("profiling: remove stale percpu flip buffer variables") +where percpu static variable "cpu_profile_flip" is removed. + +Let us replace "cpu_profile_flip" with a variable in bpf subsystem +so whenever that variable gets deleted or renamed, we can detect the +failure immediately. In this case, I picked a static percpu variable +"bpf_cgrp_storage_busy" which is defined in kernel/bpf/bpf_cgrp_storage.c. + +Reported-by: Jakub Kicinski +Signed-off-by: Yonghong Song +--- + tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +index 09a8e6f9b379..b293b8501fd6 100644 +--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c ++++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c +@@ -805,8 +805,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, + "int cpu_number = (int)100", 100); + #endif +- TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_profile_flip", int, BTF_F_COMPACT, +- "static int cpu_profile_flip = (int)2", 2); ++ TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, ++ "static int bpf_cgrp_storage_busy = (int)2", 2); + } + + static void test_btf_datasec(struct btf *btf, struct btf_dump *d, char *str, diff --git a/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch new file mode 100644 index 0000000000000..bd12bd9b3fba5 --- /dev/null +++ b/ci/diffs/0001-net-bpf-Use-sockopt_lock_sock-in-ip_sock_set_tos.patch @@ -0,0 +1,99 @@ +From c8268f8e9fa33c32e1f2f86fc7b703408a396c70 Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 27 Oct 2023 11:24:24 -0700 +Subject: [PATCH] net: bpf: Use sockopt_lock_sock() in ip_sock_set_tos() + +With latest sync from net-next tree, bpf-next has a bpf selftest failure: + [root@arch-fb-vm1 bpf]# ./test_progs -t setget_sockopt + ... 
+ [ 76.194349] ============================================ + [ 76.194682] WARNING: possible recursive locking detected + [ 76.195039] 6.6.0-rc7-g37884503df08-dirty #67 Tainted: G W OE + [ 76.195518] -------------------------------------------- + [ 76.195852] new_name/154 is trying to acquire lock: + [ 76.196159] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: ip_sock_set_tos+0x19/0x30 + [ 76.196669] + [ 76.196669] but task is already holding lock: + [ 76.197028] ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.197517] + [ 76.197517] other info that might help us debug this: + [ 76.197919] Possible unsafe locking scenario: + [ 76.197919] + [ 76.198287] CPU0 + [ 76.198444] ---- + [ 76.198600] lock(sk_lock-AF_INET); + [ 76.198831] lock(sk_lock-AF_INET); + [ 76.199062] + [ 76.199062] *** DEADLOCK *** + [ 76.199062] + [ 76.199420] May be due to missing lock nesting notation + [ 76.199420] + [ 76.199879] 2 locks held by new_name/154: + [ 76.200131] #0: ffff8c3e06ad8d30 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_listen+0x21/0x70 + [ 76.200644] #1: ffffffff90f96a40 (rcu_read_lock){....}-{1:2}, at: __cgroup_bpf_run_filter_sock_ops+0x55/0x290 + [ 76.201268] + [ 76.201268] stack backtrace: + [ 76.201538] CPU: 4 PID: 154 Comm: new_name Tainted: G W OE 6.6.0-rc7-g37884503df08-dirty #67 + [ 76.202134] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 + [ 76.202699] Call Trace: + [ 76.202858] + [ 76.203002] dump_stack_lvl+0x4b/0x80 + [ 76.203239] __lock_acquire+0x740/0x1ec0 + [ 76.203503] lock_acquire+0xc1/0x2a0 + [ 76.203766] ? ip_sock_set_tos+0x19/0x30 + [ 76.204050] ? sk_stream_write_space+0x12a/0x230 + [ 76.204389] ? lock_release+0xbe/0x260 + [ 76.204661] lock_sock_nested+0x32/0x80 + [ 76.204942] ? ip_sock_set_tos+0x19/0x30 + [ 76.205208] ip_sock_set_tos+0x19/0x30 + [ 76.205452] do_ip_setsockopt+0x4b3/0x1580 + [ 76.205719] __bpf_setsockopt+0x62/0xa0 + [ 76.205963] bpf_sock_ops_setsockopt+0x11/0x20 + [ 76.206247] bpf_prog_630217292049c96e_bpf_test_sockopt_int+0xbc/0x123 + [ 76.206660] bpf_prog_493685a3bae00bbd_bpf_test_ip_sockopt+0x49/0x4b + [ 76.207055] bpf_prog_b0bcd27f269aeea0_skops_sockopt+0x44c/0xec7 + [ 76.207437] __cgroup_bpf_run_filter_sock_ops+0xda/0x290 + [ 76.207829] __inet_listen_sk+0x108/0x1b0 + [ 76.208122] inet_listen+0x48/0x70 + [ 76.208373] __sys_listen+0x74/0xb0 + [ 76.208630] __x64_sys_listen+0x16/0x20 + [ 76.208911] do_syscall_64+0x3f/0x90 + [ 76.209174] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 + ... + +Both ip_sock_set_tos() and inet_listen() calls lock_sock(sk) which +caused a dead lock. + +To fix the issue, use sockopt_lock_sock() in ip_sock_set_tos() +instead. sockopt_lock_sock() will avoid lock_sock() if it is in bpf +context. 
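+ +For reference, a minimal sketch of what sockopt_lock_sock() does (assuming +the mainline implementation in net/core/sock.c): + + void sockopt_lock_sock(struct sock *sk) + { + /* in BPF context the socket lock is already held by the caller */ + if (has_current_bpf_ctx()) + return; + lock_sock(sk); + } +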
+ +Fixes: 878d951c6712 ("inet: lock the socket in ip_sock_set_tos()") +Suggested-by: Martin KaFai Lau +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/bpf/20231027182424.1444845-1-yonghong.song@linux.dev +--- + net/ipv4/ip_sockglue.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 9c68b6b74d9f..2efc53526a38 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -602,9 +602,9 @@ void __ip_sock_set_tos(struct sock *sk, int val) + + void ip_sock_set_tos(struct sock *sk, int val) + { +- lock_sock(sk); ++ sockopt_lock_sock(sk); + __ip_sock_set_tos(sk, val); +- release_sock(sk); ++ sockopt_release_sock(sk); + } + EXPORT_SYMBOL(ip_sock_set_tos); + +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch new file mode 100644 index 0000000000000..da5bcdc455967 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Filter-out-_GNU_SOURCE-when-compiling-.patch @@ -0,0 +1,51 @@ +From 41c24102af7b6236277a214428b203d51a3462df Mon Sep 17 00:00:00 2001 +From: Stanislav Fomichev +Date: Thu, 25 Jul 2024 14:40:29 -0700 +Subject: [PATCH 1/1] selftests/bpf: Filter out _GNU_SOURCE when compiling + test_cpp + +Jakub reports build failures when merging linux/master with net tree: + +CXX test_cpp +In file included from :454: +:2:9: error: '_GNU_SOURCE' macro redefined [-Werror,-Wmacro-redefined] + 2 | #define _GNU_SOURCE + | ^ +:445:9: note: previous definition is here + 445 | #define _GNU_SOURCE 1 + +The culprit is commit cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to +CFLAGS in lib.mk") which unconditionally added -D_GNU_SOUCE to CLFAGS. +Apparently clang++ also unconditionally adds it for the C++ targets [0] +which causes a conflict. Add small change in the selftests makefile +to filter it out for test_cpp. + +Not sure which tree it should go via, targeting bpf for now, but net +might be better? + +0: https://stackoverflow.com/questions/11670581/why-is-gnu-source-defined-by-default-and-how-to-turn-it-off + +Signed-off-by: Stanislav Fomichev +Signed-off-by: Andrii Nakryiko +Acked-by: Jiri Olsa +Link: https://lore.kernel.org/bpf/20240725214029.1760809-1-sdf@fomichev.me +--- + tools/testing/selftests/bpf/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index dd49c1d23a60..81d4757ecd4c 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -713,7 +713,7 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + # Make sure we are able to include and link libbpf against c++. 
+ $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(CFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch new file mode 100644 index 0000000000000..4ebfe20b24707 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-bpf_cookie-and-find_vma-in-nested-.patch @@ -0,0 +1,50 @@ +From f3d2080e8cf23125f79e345061149ae40f66816f Mon Sep 17 00:00:00 2001 +From: Song Liu +Date: Mon, 3 Jun 2024 23:43:17 -0700 +Subject: [PATCH bpf-next] selftests/bpf: Fix bpf_cookie and find_vma in nested + VM + +bpf_cookie and find_vma are flaky in nested VMs, which is used by some CI +systems. It turns out these failures are caused by unreliable perf event +in nested VM. Fix these by: + + 1. Use PERF_COUNT_SW_CPU_CLOCK in find_vma; + 2. Increase sample_freq in bpf_cookie. + +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/prog_tests/bpf_cookie.c | 2 +- + tools/testing/selftests/bpf/prog_tests/find_vma.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +index 4407ea428e77..070c52c312e5 100644 +--- a/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c ++++ b/tools/testing/selftests/bpf/prog_tests/bpf_cookie.c +@@ -451,7 +451,7 @@ static void pe_subtest(struct test_bpf_cookie *skel) + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; +- attr.sample_freq = 1000; ++ attr.sample_freq = 10000; + pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC); + if (!ASSERT_GE(pfd, 0, "perf_fd")) + goto cleanup; +diff --git a/tools/testing/selftests/bpf/prog_tests/find_vma.c b/tools/testing/selftests/bpf/prog_tests/find_vma.c +index 5165b38f0e59..f7619e0ade10 100644 +--- a/tools/testing/selftests/bpf/prog_tests/find_vma.c ++++ b/tools/testing/selftests/bpf/prog_tests/find_vma.c +@@ -29,8 +29,8 @@ static int open_pe(void) + + /* create perf event */ + attr.size = sizeof(attr); +- attr.type = PERF_TYPE_HARDWARE; +- attr.config = PERF_COUNT_HW_CPU_CYCLES; ++ attr.type = PERF_TYPE_SOFTWARE; ++ attr.config = PERF_COUNT_SW_CPU_CLOCK; + attr.freq = 1; + attr.sample_freq = 1000; + pfd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch new file mode 100644 index 0000000000000..d55d2e7af8651 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-Fix-pyperf180-compilation-failure-with.patch @@ -0,0 +1,78 @@ +From 100888fb6d8a185866b1520031ee7e3182b173de Mon Sep 17 00:00:00 2001 +From: Yonghong Song +Date: Fri, 10 Nov 2023 11:36:44 -0800 +Subject: [PATCH] selftests/bpf: Fix pyperf180 compilation failure with clang18 + +With latest clang18 (main branch of llvm-project repo), when building bpf selftests, + [~/work/bpf-next (master)]$ make -C tools/testing/selftests/bpf LLVM=1 -j + +The following compilation error happens: + fatal error: error in backend: Branch target out of insn range + ... + Stack dump: + 0. 
Program arguments: clang -g -Wall -Werror -D__TARGET_ARCH_x86 -mlittle-endian + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf/tools/include + -I/home/yhs/work/bpf-next/tools/testing/selftests/bpf -I/home/yhs/work/bpf-next/tools/include/uapi + -I/home/yhs/work/bpf-next/tools/testing/selftests/usr/include -idirafter + /home/yhs/work/llvm-project/llvm/build.18/install/lib/clang/18/include -idirafter /usr/local/include + -idirafter /usr/include -Wno-compare-distinct-pointer-types -DENABLE_ATOMICS_TESTS -O2 --target=bpf + -c progs/pyperf180.c -mcpu=v3 -o /home/yhs/work/bpf-next/tools/testing/selftests/bpf/pyperf180.bpf.o + 1. parser at end of file + 2. Code generation + ... + +The compilation failure only happens to cpu=v2 and cpu=v3. cpu=v4 is okay +since cpu=v4 supports 32-bit branch target offset. + +The above failure is due to upstream llvm patch [1] where some inlining behavior +are changed in clang18. + +To workaround the issue, previously all 180 loop iterations are fully unrolled. +The bpf macro __BPF_CPU_VERSION__ (implemented in clang18 recently) is used to avoid +unrolling changes if cpu=v4. If __BPF_CPU_VERSION__ is not available and the +compiler is clang18, the unrollng amount is unconditionally reduced. + + [1] https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e + +Signed-off-by: Yonghong Song +Signed-off-by: Andrii Nakryiko +Tested-by: Alan Maguire +Link: https://lore.kernel.org/bpf/20231110193644.3130906-1-yonghong.song@linux.dev +--- + tools/testing/selftests/bpf/progs/pyperf180.c | 22 +++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c +index c39f559d3100..42c4a8b62e36 100644 +--- a/tools/testing/selftests/bpf/progs/pyperf180.c ++++ b/tools/testing/selftests/bpf/progs/pyperf180.c +@@ -1,4 +1,26 @@ + // SPDX-License-Identifier: GPL-2.0 + // Copyright (c) 2019 Facebook + #define STACK_MAX_LEN 180 ++ ++/* llvm upstream commit at clang18 ++ * https://github.com/llvm/llvm-project/commit/1a2e77cf9e11dbf56b5720c607313a566eebb16e ++ * changed inlining behavior and caused compilation failure as some branch ++ * target distance exceeded 16bit representation which is the maximum for ++ * cpu v1/v2/v3. Macro __BPF_CPU_VERSION__ is later implemented in clang18 ++ * to specify which cpu version is used for compilation. So a smaller ++ * unroll_count can be set if __BPF_CPU_VERSION__ is less than 4, which ++ * reduced some branch target distances and resolved the compilation failure. ++ * ++ * To capture the case where a developer/ci uses clang18 but the corresponding ++ * repo checkpoint does not have __BPF_CPU_VERSION__, a smaller unroll_count ++ * will be set as well to prevent potential compilation failures. 
++ */ ++#ifdef __BPF_CPU_VERSION__ ++#if __BPF_CPU_VERSION__ < 4 ++#define UNROLL_COUNT 90 ++#endif ++#elif __clang_major__ == 18 ++#define UNROLL_COUNT 90 ++#endif ++ + #include "pyperf.h" +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch new file mode 100644 index 0000000000000..6497a6cc38c90 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-disable-detection-of-llvm-when-buildin.patch @@ -0,0 +1,41 @@ +From 42839864a62ee244ec280b09149b1cb439f681db Mon Sep 17 00:00:00 2001 +From: Manu Bretelle +Date: Fri, 27 Oct 2023 18:25:39 -0700 +Subject: [PATCH bpf-next] selftests/bpf: disable detection of llvm when + building bpftool + +The VMs in which we run the selftests do not have llvm installed. +We build selftests/bpftool in a host that have llvm. +bpftool currently will use llvm first and fallback to libbfd but there +is no way to disable detection from the command line. + +Removing it from the feature detection should force us to use libbfd. + +Signed-off-by: Manu Bretelle +--- + tools/bpf/bpftool/Makefile | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile +index e9154ace80ff..01314458e25e 100644 +--- a/tools/bpf/bpftool/Makefile ++++ b/tools/bpf/bpftool/Makefile +@@ -95,7 +95,6 @@ RM ?= rm -f + FEATURE_USER = .bpftool + + FEATURE_TESTS := clang-bpf-co-re +-FEATURE_TESTS += llvm + FEATURE_TESTS += libcap + FEATURE_TESTS += libbfd + FEATURE_TESTS += libbfd-liberty +@@ -104,7 +103,6 @@ FEATURE_TESTS += disassembler-four-args + FEATURE_TESTS += disassembler-init-styled + + FEATURE_DISPLAY := clang-bpf-co-re +-FEATURE_DISPLAY += llvm + FEATURE_DISPLAY += libcap + FEATURE_DISPLAY += libbfd + FEATURE_DISPLAY += libbfd-liberty +-- +2.39.3 + diff --git a/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch new file mode 100644 index 0000000000000..3fa007c51db68 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-fix-inet_csk_accept-prototype-in-test_.patch @@ -0,0 +1,32 @@ +From 0daad0a615e687e1247230f3d0c31ae60ba32314 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 May 2024 15:29:38 -0700 +Subject: [PATCH bpf-next] selftests/bpf: fix inet_csk_accept prototype in + test_sk_storage_tracing.c + +Recent kernel change ([0]) changed inet_csk_accept() prototype. Adapt +progs/test_sk_storage_tracing.c to take that into account. 
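+ +The accept arguments are now bundled into a single struct; roughly, assuming +the definition introduced by [0]: + + struct proto_accept_arg { + int flags; + int err; + int is_empty; + }; +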
+ + [0] 92ef0fd55ac8 ("net: change proto and proto_ops accept type") + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +index 02e718f06e0f..40531e56776e 100644 +--- a/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c ++++ b/tools/testing/selftests/bpf/progs/test_sk_storage_tracing.c +@@ -84,7 +84,7 @@ int BPF_PROG(trace_tcp_connect, struct sock *sk) + } + + SEC("fexit/inet_csk_accept") +-int BPF_PROG(inet_csk_accept, struct sock *sk, int flags, int *err, bool kern, ++int BPF_PROG(inet_csk_accept, struct sock *sk, struct proto_accept_arg *arg, + struct sock *accepted_sk) + { + set_task_info(accepted_sk); +-- +2.43.0 + diff --git a/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch new file mode 100644 index 0000000000000..ec1e29a8ab974 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-work-around-latest-Clang-smartness.patch @@ -0,0 +1,31 @@ +From d31a7125891994681503770cff46a119692fb2b9 Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Mon, 11 Dec 2023 17:09:38 -0800 +Subject: [PATCH 1/1] selftests/bpf: work around latest Clang smartness + +Work around the issue while we deal with it in the Clang itself. +See [0]. + + [0] https://github.com/llvm/llvm-project/pull/73662#issuecomment-1849281758 + +Signed-off-by: Andrii Nakryiko +--- + tools/testing/selftests/bpf/progs/iters.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c +index 3aca3dc145b5..929ba6fa2105 100644 +--- a/tools/testing/selftests/bpf/progs/iters.c ++++ b/tools/testing/selftests/bpf/progs/iters.c +@@ -1420,7 +1420,7 @@ SEC("raw_tp") + __success + int iter_arr_with_actual_elem_count(const void *ctx) + { +- int i, n = loop_data.n, sum = 0; ++ unsigned i, n = loop_data.n, sum = 0; + + if (n > ARRAY_SIZE(loop_data.data)) + return 0; +-- +2.34.1 + diff --git a/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch new file mode 100644 index 0000000000000..e631fac0cc698 --- /dev/null +++ b/ci/diffs/0001-selftests-bpf-xskxceiver-ksft_print_msg-fix-format-t.patch @@ -0,0 +1,89 @@ +From fe69a1b1b6ed9ffc2c578c63f526026a8ab74f0c Mon Sep 17 00:00:00 2001 +From: Anders Roxell +Date: Thu, 9 Nov 2023 18:43:28 +0100 +Subject: [PATCH] selftests: bpf: xskxceiver: ksft_print_msg: fix format type + error + +Crossbuilding selftests/bpf for architecture arm64, format specifies +type error show up like. + +xskxceiver.c:912:34: error: format specifies type 'int' but the argument +has type '__u64' (aka 'unsigned long long') [-Werror,-Wformat] + ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n", + ~~ + %llu + __func__, pkt->pkt_nb, meta->count); + ^~~~~~~~~~~ +xskxceiver.c:929:55: error: format specifies type 'unsigned long long' but + the argument has type 'u64' (aka 'unsigned long') [-Werror,-Wformat] + ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len); + ~~~~ ^~~~ + +Fixing the issues by casting to (unsigned long long) and changing the +specifiers to be %llu from %d and %u, since with u64s it might be %llx +or %lx, depending on architecture. 
+
+Signed-off-by: Anders Roxell
+Link: https://lore.kernel.org/r/20231109174328.1774571-1-anders.roxell@linaro.org
+Signed-off-by: Alexei Starovoitov
+---
+ tools/testing/selftests/bpf/xskxceiver.c | 19 ++++++++++++-------
+ 1 file changed, 12 insertions(+), 7 deletions(-)
+
+diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
+index 591ca9637b23..b604c570309a 100644
+--- a/tools/testing/selftests/bpf/xskxceiver.c
++++ b/tools/testing/selftests/bpf/xskxceiver.c
+@@ -908,8 +908,9 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr)
+ struct xdp_info *meta = data - sizeof(struct xdp_info);
+
+ if (meta->count != pkt->pkt_nb) {
+- ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%d]\n",
+- __func__, pkt->pkt_nb, meta->count);
++ ksft_print_msg("[%s] expected meta_count [%d], got meta_count [%llu]\n",
++ __func__, pkt->pkt_nb,
++ (unsigned long long)meta->count);
+ return false;
+ }
+
+@@ -926,11 +927,13 @@ static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 exp
+
+ if (addr >= umem->num_frames * umem->frame_size ||
+ addr + len > umem->num_frames * umem->frame_size) {
+- ksft_print_msg("Frag invalid addr: %llx len: %u\n", addr, len);
++ ksft_print_msg("Frag invalid addr: %llx len: %u\n",
++ (unsigned long long)addr, len);
+ return false;
+ }
+ if (!umem->unaligned_mode && addr % umem->frame_size + len > umem->frame_size) {
+- ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n", addr, len);
++ ksft_print_msg("Frag crosses frame boundary addr: %llx len: %u\n",
++ (unsigned long long)addr, len);
+ return false;
+ }
+
+@@ -1029,7 +1032,8 @@ static int complete_pkts(struct xsk_socket_info *xsk, int batch_size)
+ u64 addr = *xsk_ring_cons__comp_addr(&xsk->umem->cq, idx + rcvd - 1);
+
+ ksft_print_msg("[%s] Too many packets completed\n", __func__);
+- ksft_print_msg("Last completion address: %llx\n", addr);
++ ksft_print_msg("Last completion address: %llx\n",
++ (unsigned long long)addr);
+ return TEST_FAILURE;
+ }
+
+@@ -1513,8 +1517,9 @@ static int validate_tx_invalid_descs(struct ifobject *ifobject)
+ }
+
+ if (stats.tx_invalid_descs != ifobject->xsk->pkt_stream->nb_pkts / 2) {
+- ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%u] expected [%u]\n",
+- __func__, stats.tx_invalid_descs,
++ ksft_print_msg("[%s] tx_invalid_descs incorrect. Got [%llu] expected [%u]\n",
++ __func__,
++ (unsigned long long)stats.tx_invalid_descs,
+ ifobject->xsk->pkt_stream->nb_pkts);
+ return TEST_FAILURE;
+ }
+--
+2.34.1
+
diff --git a/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch
new file mode 100644
index 0000000000000..19d269de7e8ca
--- /dev/null
+++ b/ci/diffs/0001-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch
@@ -0,0 +1,142 @@
+From 3772e6cdb51f21a11df2acf6aa431cc8b9137bfb Mon Sep 17 00:00:00 2001
+From: Viktor Malik
+Date: Tue, 6 Feb 2024 13:46:09 +0100
+Subject: [PATCH 1/2] tools/resolve_btfids: Refactor set sorting with types
+ from btf_ids.h
+
+Instead of using magic offsets to access BTF ID set data, leverage types
+from btf_ids.h (btf_id_set and btf_id_set8) which define the actual
+layout of the data. Thanks to this change, set sorting should also
+continue working if the layout changes.
+
+This requires syncing the definition of 'struct btf_id_set8' from
+include/linux/btf_ids.h to tools/include/linux/btf_ids.h.
We don't sync
+the rest of the file at the moment, because that would also require
+syncing multiple dependent headers, and we don't need any other
+definitions from btf_ids.h.
+
+Signed-off-by: Viktor Malik
+Signed-off-by: Andrii Nakryiko
+Acked-by: Daniel Xu
+Link: https://lore.kernel.org/bpf/ff7f062ddf6a00815fda3087957c4ce667f50532.1707223196.git.vmalik@redhat.com
+---
+ tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++-------------
+ tools/include/linux/btf_ids.h | 9 +++++++++
+ 2 files changed, 30 insertions(+), 14 deletions(-)
+
+diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
+index 27a23196d58e..32634f00abba 100644
+--- a/tools/bpf/resolve_btfids/main.c
++++ b/tools/bpf/resolve_btfids/main.c
+@@ -70,6 +70,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -78,7 +79,7 @@
+ #include
+
+ #define BTF_IDS_SECTION ".BTF_ids"
+-#define BTF_ID "__BTF_ID__"
++#define BTF_ID_PREFIX "__BTF_ID__"
+
+ #define BTF_STRUCT "struct"
+ #define BTF_UNION "union"
+@@ -161,7 +162,7 @@ static int eprintf(int level, int var, const char *fmt, ...)
+
+ static bool is_btf_id(const char *name)
+ {
+- return name && !strncmp(name, BTF_ID, sizeof(BTF_ID) - 1);
++ return name && !strncmp(name, BTF_ID_PREFIX, sizeof(BTF_ID_PREFIX) - 1);
+ }
+
+ static struct btf_id *btf_id__find(struct rb_root *root, const char *name)
+@@ -441,7 +442,7 @@ static int symbols_collect(struct object *obj)
+ * __BTF_ID__TYPE__vfs_truncate__0
+ * prefix = ^
+ */
+- prefix = name + sizeof(BTF_ID) - 1;
++ prefix = name + sizeof(BTF_ID_PREFIX) - 1;
+
+ /* struct */
+ if (!strncmp(prefix, BTF_STRUCT, sizeof(BTF_STRUCT) - 1)) {
+@@ -649,19 +650,18 @@ static int cmp_id(const void *pa, const void *pb)
+ static int sets_patch(struct object *obj)
+ {
+ Elf_Data *data = obj->efile.idlist;
+- int *ptr = data->d_buf;
+ struct rb_node *next;
+
+ next = rb_first(&obj->sets);
+ while (next) {
+- unsigned long addr, idx;
++ struct btf_id_set8 *set8;
++ struct btf_id_set *set;
++ unsigned long addr, off;
+ struct btf_id *id;
+- int *base;
+- int cnt;
+
+ id = rb_entry(next, struct btf_id, rb_node);
+ addr = id->addr[0];
+- idx = addr - obj->efile.idlist_addr;
++ off = addr - obj->efile.idlist_addr;
+
+ /* sets are unique */
+ if (id->addr_cnt != 1) {
+@@ -670,14 +670,21 @@ static int sets_patch(struct object *obj)
+ return -1;
+ }
+
+- idx = idx / sizeof(int);
+- base = &ptr[idx] + (id->is_set8 ? 2 : 1);
+- cnt = ptr[idx];
++ if (id->is_set) {
++ set = data->d_buf + off;
++ qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id);
++ } else {
++ set8 = data->d_buf + off;
++ /*
++ * Make sure id is at the beginning of the pairs
++ * struct, otherwise the below qsort would not work.
++ */
++ BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id);
++ qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id);
++ }
+
+ pr_debug("sorting addr %5lu: cnt %6d [%s]\n",
+- (idx + 1) * sizeof(int), cnt, id->name);
+-
+- qsort(base, cnt, id->is_set8 ? sizeof(uint64_t) : sizeof(int), cmp_id);
++ off, id->is_set ?
set->cnt : set8->cnt, id->name);
+
+ next = rb_next(next);
+ }
+diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h
+index 2f882d5cb30f..72535f00572f 100644
+--- a/tools/include/linux/btf_ids.h
++++ b/tools/include/linux/btf_ids.h
+@@ -8,6 +8,15 @@ struct btf_id_set {
+ u32 ids[];
+ };
+
++struct btf_id_set8 {
++ u32 cnt;
++ u32 flags;
++ struct {
++ u32 id;
++ u32 flags;
++ } pairs[];
++};
++
+ #ifdef CONFIG_DEBUG_INFO_BTF
+
+ #include /* for __PASTE */
+--
+2.39.3
+
+
+
diff --git a/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch
new file mode 100644
index 0000000000000..24ebc231056cb
--- /dev/null
+++ b/ci/diffs/0001-tracing-kprobes-Fix-symbol-counting-logic-by-looking.patch
@@ -0,0 +1,65 @@
+From 08969a676d234a178ff9f8c67936a2ad98a741eb Mon Sep 17 00:00:00 2001
+From: Andrii Nakryiko
+Date: Fri, 27 Oct 2023 16:22:24 -0700
+Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at
+ modules as well
+
+Recent changes to count number of matching symbols when creating
+a kprobe event failed to take into account kernel modules. As such, it
+breaks kprobes on kernel module symbols, by assuming there is no match.
+
+Fix this by calling module_kallsyms_on_each_symbol() in addition to
+kallsyms_on_each_match_symbol() to perform proper counting.
+
+Cc: Francis Laniel
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu
+Cc: Steven Rostedt
+Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols")
+Signed-off-by: Andrii Nakryiko
+---
+ kernel/trace/trace_kprobe.c | 24 ++++++++++++++++++++----
+ 1 file changed, 20 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
+index effcaede4759..1efb27f35963 100644
+--- a/kernel/trace/trace_kprobe.c
++++ b/kernel/trace/trace_kprobe.c
+@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused)
+ return 0;
+ }
+
++struct sym_count_ctx {
++ unsigned int count;
++ const char *name;
++};
++
++static int count_mod_symbols(void *data, const char *name, unsigned long unused)
++{
++ struct sym_count_ctx *ctx = data;
++
++ if (strcmp(name, ctx->name) == 0)
++ ctx->count++;
++
++ return 0;
++}
++
+ static unsigned int number_of_same_symbols(char *func_name)
+ {
+- unsigned int count;
++ struct sym_count_ctx ctx = { .count = 0, .name = func_name };
++
++ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
+
+- count = 0;
+- kallsyms_on_each_match_symbol(count_symbols, func_name, &count);
++ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
+
+- return count;
++ return ctx.count;
+ }
+
+ static int __trace_kprobe_create(int argc, const char *argv[])
+--
+2.34.1
+
diff --git a/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch
new file mode 100644
index 0000000000000..c4d67693bd132
--- /dev/null
+++ b/ci/diffs/0002-tools-resolve_btfids-fix-cross-compilation-to-non-host-endianness.patch
@@ -0,0 +1,117 @@
+From c3dcadfdf2bf8f01471066700c098b5185240df6 Mon Sep 17 00:00:00 2001
+From: Viktor Malik
+Date: Tue, 6 Feb 2024 13:46:10 +0100
+Subject: [PATCH 2/2] tools/resolve_btfids: Fix cross-compilation to non-host
+ endianness
+
+The .BTF_ids section is pre-filled with zeroed BTF ID entries during the
+build and afterwards patched by resolve_btfids with correct values.
+Since resolve_btfids always writes in host-native endianness, it relies +on libelf to do the translation when the target ELF is cross-compiled to +a different endianness (this was introduced in commit 61e8aeda9398 +("bpf: Fix libelf endian handling in resolv_btfids")). + +Unfortunately, the translation will corrupt the flags fields of SET8 +entries because these were written during vmlinux compilation and are in +the correct endianness already. This will lead to numerous selftests +failures such as: + + $ sudo ./test_verifier 502 502 + #502/p sleepable fentry accept FAIL + Failed to load prog 'Invalid argument'! + bpf_fentry_test1 is not sleepable + verification time 34 usec + stack depth 0 + processed 0 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 + Summary: 0 PASSED, 0 SKIPPED, 1 FAILED + +Since it's not possible to instruct libelf to translate just certain +values, let's manually bswap the flags (both global and entry flags) in +resolve_btfids when needed, so that libelf then translates everything +correctly. + +Fixes: ef2c6f370a63 ("tools/resolve_btfids: Add support for 8-byte BTF sets") +Signed-off-by: Viktor Malik +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/bpf/7b6bff690919555574ce0f13d2a5996cacf7bf69.1707223196.git.vmalik@redhat.com +--- + tools/bpf/resolve_btfids/main.c | 35 +++++++++++++++++++++++++++++++++ + 1 file changed, 35 insertions(+) + +diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c +index 32634f00abba..d9520cb826b3 100644 +--- a/tools/bpf/resolve_btfids/main.c ++++ b/tools/bpf/resolve_btfids/main.c +@@ -90,6 +90,14 @@ + + #define ADDR_CNT 100 + ++#if __BYTE_ORDER == __LITTLE_ENDIAN ++# define ELFDATANATIVE ELFDATA2LSB ++#elif __BYTE_ORDER == __BIG_ENDIAN ++# define ELFDATANATIVE ELFDATA2MSB ++#else ++# error "Unknown machine endianness!" ++#endif ++ + struct btf_id { + struct rb_node rb_node; + char *name; +@@ -117,6 +125,7 @@ struct object { + int idlist_shndx; + size_t strtabidx; + unsigned long idlist_addr; ++ int encoding; + } efile; + + struct rb_root sets; +@@ -320,6 +329,7 @@ static int elf_collect(struct object *obj) + { + Elf_Scn *scn = NULL; + size_t shdrstrndx; ++ GElf_Ehdr ehdr; + int idx = 0; + Elf *elf; + int fd; +@@ -351,6 +361,13 @@ static int elf_collect(struct object *obj) + return -1; + } + ++ if (gelf_getehdr(obj->efile.elf, &ehdr) == NULL) { ++ pr_err("FAILED cannot get ELF header: %s\n", ++ elf_errmsg(-1)); ++ return -1; ++ } ++ obj->efile.encoding = ehdr.e_ident[EI_DATA]; ++ + /* + * Scan all the elf sections and look for save data + * from .BTF_ids section and symbols. +@@ -681,6 +698,24 @@ static int sets_patch(struct object *obj) + */ + BUILD_BUG_ON(set8->pairs != &set8->pairs[0].id); + qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); ++ ++ /* ++ * When ELF endianness does not match endianness of the ++ * host, libelf will do the translation when updating ++ * the ELF. This, however, corrupts SET8 flags which are ++ * already in the target endianness. So, let's bswap ++ * them to the host endianness and libelf will then ++ * correctly translate everything. 
++ */
++ if (obj->efile.encoding != ELFDATANATIVE) {
++ int i;
++
++ set8->flags = bswap_32(set8->flags);
++ for (i = 0; i < set8->cnt; i++) {
++ set8->pairs[i].flags =
++ bswap_32(set8->pairs[i].flags);
++ }
++ }
+ }
+
+ pr_debug("sorting addr %5lu: cnt %6d [%s]\n",
+--
+2.39.3
+
diff --git a/ci/diffs/0099-s390x_nolockdep.diff b/ci/diffs/0099-s390x_nolockdep.diff
new file mode 100644
index 0000000000000..44c2d1a520656
--- /dev/null
+++ b/ci/diffs/0099-s390x_nolockdep.diff
@@ -0,0 +1,48 @@
+From 470d0c7874ac638ea62cddc3a20ec047fa4ab539 Mon Sep 17 00:00:00 2001
+From: Manu Bretelle
+Date: Wed, 14 Feb 2024 17:25:35 -0800
+Subject: [PATCH] bpf/selftests: disable lockdep on s390x
+
+Tests are slow to run on s390x; disabling lockdep should make them faster.
+
+Signed-off-by: Manu Bretelle
+---
+ tools/testing/selftests/bpf/config.s390x | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/bpf/config.s390x b/tools/testing/selftests/bpf/config.s390x
+index 706931a8c2c69..67bfd62b0b582 100644
+--- a/tools/testing/selftests/bpf/config.s390x
++++ b/tools/testing/selftests/bpf/config.s390x
+@@ -23,11 +23,11 @@ CONFIG_CPUSETS=y
+ CONFIG_CRASH_DUMP=y
+ CONFIG_CRYPTO_USER_API_RNG=y
+ CONFIG_CRYPTO_USER_API_SKCIPHER=y
+-CONFIG_DEBUG_ATOMIC_SLEEP=y
++CONFIG_DEBUG_ATOMIC_SLEEP=n
+ CONFIG_DEBUG_INFO_BTF=y
+ CONFIG_DEBUG_INFO_DWARF4=y
+ CONFIG_DEBUG_LIST=y
+-CONFIG_DEBUG_LOCKDEP=y
++CONFIG_DEBUG_LOCKDEP=n
+ CONFIG_DEBUG_NOTIFIERS=y
+ CONFIG_DEBUG_PAGEALLOC=y
+ CONFIG_DEBUG_SECTION_MISMATCH=y
+@@ -71,7 +71,7 @@ CONFIG_KRETPROBES=y
+ CONFIG_KSM=y
+ CONFIG_LATENCYTOP=y
+ CONFIG_LIVEPATCH=y
+-CONFIG_LOCK_STAT=y
++CONFIG_LOCK_STAT=n
+ CONFIG_MACVLAN=y
+ CONFIG_MACVTAP=y
+ CONFIG_MAGIC_SYSRQ=y
+@@ -101,7 +101,7 @@ CONFIG_PCI=y
+ CONFIG_POSIX_MQUEUE=y
+ CONFIG_PROC_KCORE=y
+ CONFIG_PROFILING=y
+-CONFIG_PROVE_LOCKING=y
++CONFIG_PROVE_LOCKING=n
+ CONFIG_PTDUMP_DEBUGFS=y
+ CONFIG_RC_DEVICES=y
+ CONFIG_RC_LOOPBACK=y
diff --git a/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch
new file mode 100644
index 0000000000000..9702960f2b4c3
--- /dev/null
+++ b/ci/diffs/0099-scripts-sorttable-Fix-endianness-handling-in-build-t.patch
@@ -0,0 +1,41 @@
+From 5f799e4752d441263ae6656062e2fca98f51c6cb Mon Sep 17 00:00:00 2001
+From: Vasily Gorbik
+Date: Wed, 2 Apr 2025 03:15:35 +0200
+Subject: [PATCH] scripts/sorttable: Fix endianness handling in build-time
+ mcount sort
+
+Kernel cross-compilation with BUILDTIME_MCOUNT_SORT produces zeroed
+mcount values if the build-host endianness does not match the ELF
+file endianness.
+
+The mcount values array is converted from ELF file
+endianness to build-host endianness during initialization in
+fill_relocs()/fill_addrs(). Avoid extra conversion of these values during
+weak-function zeroing; otherwise, they do not match nm-parsed addresses
+and all mcount values are zeroed out.
+ +Fixes: ef378c3b8233 ("scripts/sorttable: Zero out weak functions in mcount_loc table") +Reported-by: Ilya Leoshkevich +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/all/your-ad-here.call-01743522822-ext-4975@work.hours/ +Signed-off-by: Vasily Gorbik +--- + scripts/sorttable.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/sorttable.c b/scripts/sorttable.c +index 7b4b3714b1af..deed676bfe38 100644 +--- a/scripts/sorttable.c ++++ b/scripts/sorttable.c +@@ -857,7 +857,7 @@ static void *sort_mcount_loc(void *arg) + for (void *ptr = vals; ptr < vals + size; ptr += long_size) { + uint64_t key; + +- key = long_size == 4 ? r((uint32_t *)ptr) : r8((uint64_t *)ptr); ++ key = long_size == 4 ? *(uint32_t *)ptr : *(uint64_t *)ptr; + if (!find_func(key)) { + if (long_size == 4) + *(uint32_t *)ptr = 0; +-- +2.49.0 + diff --git a/ci/diffs/0099-selftest-cross-compile.diff b/ci/diffs/0099-selftest-cross-compile.diff new file mode 100644 index 0000000000000..e8732596bdb3f --- /dev/null +++ b/ci/diffs/0099-selftest-cross-compile.diff @@ -0,0 +1,13 @@ +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index a38a3001527c..af68528cc944 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -304,7 +304,7 @@ $(OUTPUT)/test_maps: $(TESTING_HELPERS) + $(OUTPUT)/test_verifier: $(TESTING_HELPERS) $(CAP_HELPERS) $(UNPRIV_HELPERS) + $(OUTPUT)/xsk.o: $(BPFOBJ) + +-BPFTOOL ?= $(DEFAULT_BPFTOOL) ++BPFTOOL ?= $(TRUNNER_BPFTOOL) + $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ diff --git a/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch new file mode 100644 index 0000000000000..b81d22a35322c --- /dev/null +++ b/ci/diffs/0199-iov_iter-fix-advancing-slot-in-iter_folioq_get_pages.patch @@ -0,0 +1,46 @@ +From 0d24852bd71ec85ca0016b6d6fc997e6a3381552 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Mon, 30 Sep 2024 11:55:00 -0700 +Subject: [PATCH] iov_iter: fix advancing slot in iter_folioq_get_pages() + +iter_folioq_get_pages() decides to advance to the next folioq slot when +it has reached the end of the current folio. However, it is checking +offset, which is the beginning of the current part, instead of +iov_offset, which is adjusted to the end of the current part, so it +doesn't advance the slot when it's supposed to. As a result, on the next +iteration, we'll use the same folio with an out-of-bounds offset and +return an unrelated page. + +This manifested as various crashes and other failures in 9pfs in drgn's +VM testing setup and BPF CI. 
+
+Fixes: db0aa2e9566f ("mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios")
+Link: https://lore.kernel.org/linux-fsdevel/20240923183432.1876750-1-chantr4@gmail.com/
+Tested-by: Manu Bretelle
+Signed-off-by: Omar Sandoval
+Link: https://lore.kernel.org/r/cbaf141ba6c0e2e209717d02746584072844841a.1727722269.git.osandov@fb.com
+Tested-by: Eduard Zingerman
+Tested-by: Leon Romanovsky
+Tested-by: Joey Gouly
+Acked-by: David Howells
+Signed-off-by: Christian Brauner
+---
+ lib/iov_iter.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/lib/iov_iter.c b/lib/iov_iter.c
+index 97003155b..1abb32c0d 100644
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -1033,7 +1033,7 @@ static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
+ if (maxpages == 0 || extracted >= maxsize)
+ break;
+
+- if (offset >= fsize) {
++ if (iov_offset >= fsize) {
+ iov_offset = 0;
+ slot++;
+ if (slot == folioq_nr_slots(folioq) && folioq->next) {
+--
+2.34.1
+
diff --git a/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch
new file mode 100644
index 0000000000000..11aa3626f5a81
--- /dev/null
+++ b/ci/diffs/0299-selftests-bpf-Fix-uprobe-consumer-test.patch
@@ -0,0 +1,58 @@
+From affb32e4f056883f285f8535b766293b85752fb4 Mon Sep 17 00:00:00 2001
+From: Jiri Olsa
+Date: Tue, 24 Sep 2024 13:07:30 +0200
+Subject: [PATCH] selftests/bpf: Fix uprobe consumer test
+
+With the newly merged code the uprobe behaviour is slightly different,
+which affects the uprobe consumer test.
+
+We no longer need to check if the uprobe object is still preserved
+after removing the last uretprobe, because it stays as long as there's
+a pending/installed uretprobe instance.
+
+This allows running uretprobe consumers registered 'after' the uprobe
+was hit, even if a previous uretprobe got unregistered before being hit.
+
+The uprobe object will now be removed after the last uprobe ref is
+released; in such a case it's held by ri->uprobe (return instance),
+which is released after the uretprobe is hit.
+
+Reported-by: Ihor Solodrai
+Signed-off-by: Jiri Olsa
+Signed-off-by: Daniel Borkmann
+Tested-by: Ihor Solodrai
+Closes: https://lore.kernel.org/bpf/w6U8Z9fdhjnkSp2UaFaV1fGqJXvfLEtDKEUyGDkwmoruDJ_AgF_c0FFhrkeKW18OqiP-05s9yDKiT6X-Ns-avN_ABf0dcUkXqbSJN1TQSXo=@pm.me/
+---
+ .../testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+
+diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+index 844f6fc8487b..c1ac813ff9ba 100644
+--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
++++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c
+@@ -869,21 +869,14 @@ static void consumer_test(struct uprobe_multi_consumers *skel,
+ fmt = "prog 0/1: uprobe";
+ } else {
+ /*
+- * uprobe return is tricky ;-)
+- *
+ * to trigger uretprobe consumer, the uretprobe needs to be installed,
+ * which means one of the 'return' uprobes was alive when probe was hit:
+ *
+ * idxs: 2/3 uprobe return in 'installed' mask
+- *
+- * in addition if 'after' state removes everything that was installed in
+- * 'before' state, then uprobe kernel object goes away and return uprobe
+- * is not installed and we won't hit it even if it's in 'after' state.
+ */
+ unsigned long had_uretprobes = before & 0b1100; /* is uretprobe installed */
+- unsigned long probe_preserved = before & after; /* did uprobe go away */
+
+- if (had_uretprobes && probe_preserved && test_bit(idx, after))
++ if (had_uretprobes && test_bit(idx, after))
+ val++;
+ fmt = "idx 2/3: uretprobe";
+ }
+--
+2.34.1
+
diff --git a/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch
new file mode 100644
index 0000000000000..1f95c3c237cb9
--- /dev/null
+++ b/ci/diffs/0499-samples-bpf-fix-samples-compilation.patch
@@ -0,0 +1,61 @@
+From 80a5958a52b86e39c1a1bf5f4702011c0cf6ab4f Mon Sep 17 00:00:00 2001
+From: Eduard Zingerman
+Date: Mon, 2 Dec 2024 12:14:46 -0800
+Subject: [PATCH] samples/bpf: fix samples compilation
+
+Commit [0] breaks the samples build.
+
+TODO: add more details here
+
+[0] 13b25489b6f8 ("kbuild: change working directory to external module directory with M=")
+
+Signed-off-by: Eduard Zingerman
+Signed-off-by: Ihor Solodrai
+---
+ samples/bpf/Makefile | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
+index bcf103a4c14f..ee10dbf1b471 100644
+--- a/samples/bpf/Makefile
++++ b/samples/bpf/Makefile
+@@ -146,13 +146,14 @@ ifeq ($(ARCH), x86)
+ BPF_EXTRA_CFLAGS += -fcf-protection
+ endif
+
+-TPROGS_CFLAGS += -Wall -O2
+-TPROGS_CFLAGS += -Wmissing-prototypes
+-TPROGS_CFLAGS += -Wstrict-prototypes
+-TPROGS_CFLAGS += $(call try-run,\
++COMMON_CFLAGS += -Wall -O2
++COMMON_CFLAGS += -Wmissing-prototypes
++COMMON_CFLAGS += -Wstrict-prototypes
++COMMON_CFLAGS += $(call try-run,\
+ printf "int main() { return 0; }" |\
+ $(CC) -Werror -fsanitize=bounds -x c - -o "$$TMP",-fsanitize=bounds,)
+
++TPROGS_CFLAGS += $(COMMON_CFLAGS)
+ TPROGS_CFLAGS += -I$(objtree)/usr/include
+ TPROGS_CFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
+ TPROGS_CFLAGS += -I$(LIBBPF_INCLUDE)
+@@ -229,7 +230,7 @@ clean:
+
+ $(LIBBPF): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
+ # Fix up variables inherited from Kbuild that tools/ build system won't like
+- $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \
++ $(MAKE) -C $(LIBBPF_SRC) RM='rm -rf' EXTRA_CFLAGS="$(COMMON_CFLAGS)" \
+ LDFLAGS="$(TPROGS_LDFLAGS)" srctree=$(BPF_SAMPLES_PATH)/../../ \
+ O= OUTPUT=$(LIBBPF_OUTPUT)/ DESTDIR=$(LIBBPF_DESTDIR) prefix= \
+ $@ install_headers
+@@ -305,7 +306,7 @@ $(obj)/$(TRACE_HELPERS): TPROGS_CFLAGS := $(TPROGS_CFLAGS) -D__must_check=
+ -include $(BPF_SAMPLES_PATH)/Makefile.target
+
+ VMLINUX_BTF_PATHS ?= $(abspath $(if $(O),$(O)/vmlinux)) \
+- $(abspath $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)) \
++ $(abspath $(if $(objtree),$(objtree)/vmlinux)) \
+ $(abspath ./vmlinux)
+ VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+
+--
+2.47.0
+
diff --git a/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch
new file mode 100644
index 0000000000000..7e38b1a151a10
--- /dev/null
+++ b/ci/diffs/2000-s390-fgraph-Fix-to-remove-ftrace_test_recursion_tryl.patch
@@ -0,0 +1,46 @@
+From faf291ff4beaef8dedebd166f11f815cdee257dc Mon Sep 17 00:00:00 2001
+From: "Masami Hiramatsu (Google)"
+Date: Wed, 29 Jan 2025 00:29:37 +0900
+Subject: [PATCH 2000/2001] s390: fgraph: Fix to remove
+ ftrace_test_recursion_trylock()
+
+Fix to remove ftrace_test_recursion_trylock() from ftrace_graph_func()
+because commit d576aec24df9 ("fgraph: 
Get ftrace recursion lock in
+function_graph_enter") has already moved it to function_graph_enter_regs().
+
+Reported-by: Jiri Olsa
+Closes: https://lore.kernel.org/all/Z5O0shrdgeExZ2kF@krava/
+Fixes: d576aec24df9 ("fgraph: Get ftrace recursion lock in function_graph_enter")
+Signed-off-by: Masami Hiramatsu (Google)
+Tested-by: Jiri Olsa
+---
+ arch/s390/kernel/ftrace.c | 5 -----
+ 1 file changed, 5 deletions(-)
+
+diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
+index c0b2c97efefb..63ba6306632e 100644
+--- a/arch/s390/kernel/ftrace.c
++++ b/arch/s390/kernel/ftrace.c
+@@ -266,18 +266,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+ {
+ unsigned long *parent = &arch_ftrace_regs(fregs)->regs.gprs[14];
+- int bit;
+
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+ if (unlikely(atomic_read(&current->tracing_graph_pause)))
+ return;
+- bit = ftrace_test_recursion_trylock(ip, *parent);
+- if (bit < 0)
+- return;
+ if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs))
+ *parent = (unsigned long)&return_to_handler;
+- ftrace_test_recursion_unlock(bit);
+ }
+
+ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+--
+2.48.1
+
diff --git a/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch
new file mode 100644
index 0000000000000..66483206f448f
--- /dev/null
+++ b/ci/diffs/2001-s390-tracing-Define-ftrace_get_symaddr-for-s390.patch
@@ -0,0 +1,28 @@
+From 04fce0d606f59a62105729094013c4784492ec7b Mon Sep 17 00:00:00 2001
+From: "Masami Hiramatsu (Google)"
+Date: Wed, 29 Jan 2025 00:29:48 +0900
+Subject: [PATCH 2001/2001] s390: tracing: Define ftrace_get_symaddr() for s390
+
+Add ftrace_get_symaddr() for s390, which returns the symbol address
+from ftrace's 'ip' parameter.
+
+Signed-off-by: Masami Hiramatsu (Google)
+---
+ arch/s390/include/asm/ftrace.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h
+index a3b73a4f626e..185331e91f83 100644
+--- a/arch/s390/include/asm/ftrace.h
++++ b/arch/s390/include/asm/ftrace.h
+@@ -51,6 +51,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
+ {
+ return addr;
+ }
++#define ftrace_get_symaddr(fentry_ip) ((unsigned long)(fentry_ip))
+
+ #include
+
+--
+2.48.1
+
diff --git a/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch
new file mode 100644
index 0000000000000..9b24de70e2846
--- /dev/null
+++ b/ci/diffs/2001-selftests-bpf-add-fno-strict-aliasing-to-BPF_CFLAGS.patch
@@ -0,0 +1,75 @@
+From f44275e7155dc310d36516fc25be503da099781c Mon Sep 17 00:00:00 2001
+From: Ihor Solodrai
+Date: Mon, 6 Jan 2025 20:17:31 +0000
+Subject: [PATCH] selftests/bpf: add -fno-strict-aliasing to BPF_CFLAGS
+
+Following the discussion at [1], set the -fno-strict-aliasing flag for
+all BPF object build rules. Remove the now-unnecessary -CFLAGS variables.
+
+[1] https://lore.kernel.org/bpf/20250106185447.951609-1-ihor.solodrai@pm.me/
+
+CC: Jose E. 
Marchesi +Signed-off-by: Ihor Solodrai +Acked-by: Eduard Zingerman +Link: https://lore.kernel.org/r/20250106201728.1219791-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 28 +--------------------------- + 1 file changed, 1 insertion(+), 27 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index eb4d21651aa7..d5be2f94deef 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -54,21 +54,6 @@ PCAP_LIBS := $(shell $(PKG_CONFIG) --libs libpcap 2>/dev/null) + LDLIBS += $(PCAP_LIBS) + CFLAGS += $(PCAP_CFLAGS) + +-# The following tests perform type punning and they may break strict +-# aliasing rules, which are exploited by both GCC and clang by default +-# while optimizing. This can lead to broken programs. +-progs/bind4_prog.c-CFLAGS := -fno-strict-aliasing +-progs/bind6_prog.c-CFLAGS := -fno-strict-aliasing +-progs/dynptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/linked_list_fail.c-CFLAGS := -fno-strict-aliasing +-progs/map_kptr_fail.c-CFLAGS := -fno-strict-aliasing +-progs/syscall.c-CFLAGS := -fno-strict-aliasing +-progs/test_pkt_md_access.c-CFLAGS := -fno-strict-aliasing +-progs/test_sk_lookup.c-CFLAGS := -fno-strict-aliasing +-progs/timer_crash.c-CFLAGS := -fno-strict-aliasing +-progs/test_global_func9.c-CFLAGS := -fno-strict-aliasing +-progs/verifier_nocsr.c-CFLAGS := -fno-strict-aliasing +- + # Some utility functions use LLVM libraries + jit_disasm_helpers.c-CFLAGS = $(LLVM_CFLAGS) + +@@ -103,18 +88,6 @@ progs/btf_dump_test_case_packing.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_padding.c-bpf_gcc-CFLAGS := -Wno-error + progs/btf_dump_test_case_syntax.c-bpf_gcc-CFLAGS := -Wno-error + +-# The following tests do type-punning, via the __imm_insn macro, from +-# `struct bpf_insn' to long and then uses the value. This triggers an +-# "is used uninitialized" warning in GCC due to strict-aliasing +-# rules. 
+-progs/verifier_ref_tracking.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_unpriv.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_cgroup_storage.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_ld_ind.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_map_ret_val.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_spill_fill.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_subprog_precision.c-bpf_gcc-CFLAGS := -fno-strict-aliasing +-progs/verifier_uninit.c-bpf_gcc-CFLAGS := -fno-strict-aliasing + endif + + ifneq ($(CLANG_CPUV4),) +@@ -474,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare + +-- +2.47.1 + diff --git a/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch new file mode 100644 index 0000000000000..127b2641dc223 --- /dev/null +++ b/ci/diffs/2002-selftests-bpf-add-std-gnu11-to-BPF_CFLAGS-and-CFLAGS.patch @@ -0,0 +1,63 @@ +From bab18c7db44d3aa6c84450095451580922359c7a Mon Sep 17 00:00:00 2001 +From: Ihor Solodrai +Date: Tue, 7 Jan 2025 23:58:18 +0000 +Subject: [PATCH] selftests/bpf: add -std=gnu11 to BPF_CFLAGS and CFLAGS + +Latest versions of GCC BPF use C23 standard by default. This causes +compilation errors in vmlinux.h due to bool types declarations. + +Add -std=gnu11 to BPF_CFLAGS and CFLAGS. This aligns with the version +of the standard used when building the kernel currently [1]. + +For more details see the discussions at [2] and [3]. + +[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Makefile#n465 +[2] https://lore.kernel.org/bpf/EYcXjcKDCJY7Yb0GGtAAb7nLKPEvrgWdvWpuNzXm2qi6rYMZDixKv5KwfVVMBq17V55xyC-A1wIjrqG3aw-Imqudo9q9X7D7nLU2gWgbN0w=@pm.me/ +[3] https://lore.kernel.org/bpf/20250106202715.1232864-1-ihor.solodrai@pm.me/ + +CC: Jose E. 
Marchesi +Signed-off-by: Ihor Solodrai +Link: https://lore.kernel.org/r/20250107235813.2964472-1-ihor.solodrai@pm.me +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/Makefile | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile +index d5be2f94deef..ea9cee5de0f8 100644 +--- a/tools/testing/selftests/bpf/Makefile ++++ b/tools/testing/selftests/bpf/Makefile +@@ -41,7 +41,7 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) + srctree := $(patsubst %/,%,$(dir $(srctree))) + endif + +-CFLAGS += -g $(OPT_FLAGS) -rdynamic \ ++CFLAGS += -g $(OPT_FLAGS) -rdynamic -std=gnu11 \ + -Wall -Werror -fno-omit-frame-pointer \ + $(GENFLAGS) $(SAN_CFLAGS) $(LIBELF_CFLAGS) \ + -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ +@@ -447,6 +447,7 @@ CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH)) + BPF_CFLAGS = -g -Wall -Werror -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \ + -I$(abspath $(OUTPUT)/../usr/include) \ ++ -std=gnu11 \ + -fno-strict-aliasing \ + -Wno-compare-distinct-pointer-types + # TODO: enable me -Wsign-compare +@@ -787,9 +788,12 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp + $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@ + + # Make sure we are able to include and link libbpf against c++. ++CXXFLAGS += $(CFLAGS) ++CXXFLAGS := $(subst -D_GNU_SOURCE=,,$(CXXFLAGS)) ++CXXFLAGS := $(subst -std=gnu11,-std=gnu++11,$(CXXFLAGS)) + $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ) + $(call msg,CXX,,$@) +- $(Q)$(CXX) $(subst -D_GNU_SOURCE=,,$(CFLAGS)) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ ++ $(Q)$(CXX) $(CXXFLAGS) $(filter %.a %.o %.cpp,$^) $(LDLIBS) -o $@ + + # Benchmark runner + $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h $(BPFOBJ) +-- +2.47.1 + diff --git a/ci/diffs/20250611-veth-prevent-NULL-pointer-dereference-in-veth_xdp_rc.patch b/ci/diffs/20250611-veth-prevent-NULL-pointer-dereference-in-veth_xdp_rc.patch new file mode 100644 index 0000000000000..28a3daaf5991c --- /dev/null +++ b/ci/diffs/20250611-veth-prevent-NULL-pointer-dereference-in-veth_xdp_rc.patch @@ -0,0 +1,52 @@ +From 5bfcfb00e0bd7314a9b86b55e1c7e009aa70111e Mon Sep 17 00:00:00 2001 +From: Jesper Dangaard Brouer +Date: Wed, 11 Jun 2025 14:40:04 +0200 +Subject: [PATCH] veth: prevent NULL pointer dereference in veth_xdp_rcv + +The veth peer device is RCU protected, but when the peer device gets +deleted (veth_dellink) then the pointer is assigned NULL (via +RCU_INIT_POINTER). + +This patch adds a necessary NULL check in veth_xdp_rcv when accessing +the veth peer net_device. + +This fixes a bug introduced in commit dc82a33297fc ("veth: apply qdisc +backpressure on full ptr_ring to reduce TX drops"). The bug is a race +and only triggers when having inflight packets on a veth that is being +deleted. 
+
+Reported-by: Ihor Solodrai
+Closes: https://lore.kernel.org/all/fecfcad0-7a16-42b8-bff2-66ee83a6e5c4@linux.dev/
+Reported-by: syzbot+c4c7bf27f6b0c4bd97fe@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/all/683da55e.a00a0220.d8eae.0052.GAE@google.com/
+Fixes: dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to reduce TX drops")
+Signed-off-by: Jesper Dangaard Brouer
+---
+ drivers/net/veth.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/veth.c b/drivers/net/veth.c
+index e58a0f1b5c5b..a3046142cb8e 100644
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -909,7 +909,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
+
+ /* NAPI functions as RCU section */
+ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
+- peer_txq = netdev_get_tx_queue(peer_dev, queue_idx);
++ peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL;
+
+ for (i = 0; i < budget; i++) {
+ void *ptr = __ptr_ring_consume(&rq->xdp_ring);
+@@ -959,7 +959,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
+ rq->stats.vs.xdp_packets += done;
+ u64_stats_update_end(&rq->stats.syncp);
+
+- if (unlikely(netif_tx_queue_stopped(peer_txq)))
++ if (peer_txq && unlikely(netif_tx_queue_stopped(peer_txq)))
+ netif_tx_wake_queue(peer_txq);
+
+ return done;
+--
+2.49.0
+
diff --git a/ci/diffs/20250627-elftests-bpf-Fix-cgroup_xattr-read_cgroupfs_xattr.patch b/ci/diffs/20250627-elftests-bpf-Fix-cgroup_xattr-read_cgroupfs_xattr.patch
new file mode 100644
index 0000000000000..4bb447b04c5bc
--- /dev/null
+++ b/ci/diffs/20250627-elftests-bpf-Fix-cgroup_xattr-read_cgroupfs_xattr.patch
@@ -0,0 +1,262 @@
+From 9076f6b1ee16b1c5435a1c574a28fd115f8e7022 Mon Sep 17 00:00:00 2001
+From: Song Liu
+Date: Fri, 27 Jun 2025 12:12:21 -0700
+Subject: [PATCH] selftests/bpf: Fix cgroup_xattr/read_cgroupfs_xattr
+
+cgroup_xattr/read_cgroupfs_xattr has two issues:
+
+1. cgroup_xattr/read_cgroupfs_xattr messes up lo without creating a netns
+ first. This causes issues with other tests.
+
+ Fix this by using a different hook (lsm.s/file_open) and not messing
+ with lo.
+
+2. cgroup_xattr/read_cgroupfs_xattr sets up cgroups without proper
+ mount namespaces.
+
+ Fix this by using the existing cgroup helpers. A new helper
+ set_cgroup_xattr() is added to set xattr on cgroup files. 
+ +Fixes: f4fba2d6d282 ("selftests/bpf: Add tests for bpf_cgroup_read_xattr") +Reported-by: Alexei Starovoitov +Closes: https://lore.kernel.org/bpf/CAADnVQ+iqMi2HEj_iH7hsx+XJAsqaMWqSDe4tzcGAnehFWA9Sw@mail.gmail.com/ +Signed-off-by: Song Liu +--- + tools/testing/selftests/bpf/cgroup_helpers.c | 21 ++++ + tools/testing/selftests/bpf/cgroup_helpers.h | 4 + + .../selftests/bpf/prog_tests/cgroup_xattr.c | 117 ++++-------------- + .../selftests/bpf/progs/read_cgroupfs_xattr.c | 4 +- + 4 files changed, 49 insertions(+), 97 deletions(-) + +diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c +index e4535451322e..15f626014872 100644 +--- a/tools/testing/selftests/bpf/cgroup_helpers.c ++++ b/tools/testing/selftests/bpf/cgroup_helpers.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -318,6 +319,26 @@ int join_parent_cgroup(const char *relative_path) + return join_cgroup_from_top(cgroup_path); + } + ++/** ++ * set_cgroup_xattr() - Set xattr on a cgroup dir ++ * @relative_path: The cgroup path, relative to the workdir, to set xattr ++ * @name: xattr name ++ * @value: xattr value ++ * ++ * This function set xattr on cgroup dir. ++ * ++ * On success, it returns 0, otherwise on failure it returns -1. ++ */ ++int set_cgroup_xattr(const char *relative_path, ++ const char *name, ++ const char *value) ++{ ++ char cgroup_path[PATH_MAX + 1]; ++ ++ format_cgroup_path(cgroup_path, relative_path); ++ return setxattr(cgroup_path, name, value, strlen(value) + 1, 0); ++} ++ + /** + * __cleanup_cgroup_environment() - Delete temporary cgroups + * +diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h +index 502845160d88..182e1ac36c95 100644 +--- a/tools/testing/selftests/bpf/cgroup_helpers.h ++++ b/tools/testing/selftests/bpf/cgroup_helpers.h +@@ -26,6 +26,10 @@ int join_cgroup(const char *relative_path); + int join_root_cgroup(void); + int join_parent_cgroup(const char *relative_path); + ++int set_cgroup_xattr(const char *relative_path, ++ const char *name, ++ const char *value); ++ + int setup_cgroup_environment(void); + void cleanup_cgroup_environment(void); + +diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c +index 87978a0f7eb7..e0dd966e4a3e 100644 +--- a/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c ++++ b/tools/testing/selftests/bpf/prog_tests/cgroup_xattr.c +@@ -7,133 +7,60 @@ + #include + #include + #include +-#include +- + #include ++#include "cgroup_helpers.h" + + #include "read_cgroupfs_xattr.skel.h" + #include "cgroup_read_xattr.skel.h" + +-#define CGROUP_FS_ROOT "/sys/fs/cgroup/" +-#define CGROUP_FS_PARENT CGROUP_FS_ROOT "foo/" ++#define CGROUP_FS_PARENT "foo/" + #define CGROUP_FS_CHILD CGROUP_FS_PARENT "bar/" +- +-static int move_pid_to_cgroup(const char *cgroup_folder, pid_t pid) +-{ +- char filename[128]; +- char pid_str[64]; +- int procs_fd; +- int ret; +- +- snprintf(filename, sizeof(filename), "%scgroup.procs", cgroup_folder); +- snprintf(pid_str, sizeof(pid_str), "%d", pid); +- +- procs_fd = open(filename, O_WRONLY | O_APPEND); +- if (!ASSERT_OK_FD(procs_fd, "open")) +- return -1; +- +- ret = write(procs_fd, pid_str, strlen(pid_str)); +- close(procs_fd); +- if (!ASSERT_GT(ret, 0, "write cgroup.procs")) +- return -1; +- return 0; +-} +- +-static void reset_cgroups_and_lo(void) +-{ +- rmdir(CGROUP_FS_CHILD); +- rmdir(CGROUP_FS_PARENT); +- system("ip addr del 1.1.1.1/32 
dev lo"); +- system("ip link set dev lo down"); +-} ++#define TMP_FILE "/tmp/selftests_cgroup_xattr" + + static const char xattr_value_a[] = "bpf_selftest_value_a"; + static const char xattr_value_b[] = "bpf_selftest_value_b"; + static const char xattr_name[] = "user.bpf_test"; + +-static int setup_cgroups_and_lo(void) +-{ +- int err; +- +- err = mkdir(CGROUP_FS_PARENT, 0755); +- if (!ASSERT_OK(err, "mkdir 1")) +- goto error; +- err = mkdir(CGROUP_FS_CHILD, 0755); +- if (!ASSERT_OK(err, "mkdir 2")) +- goto error; +- +- err = setxattr(CGROUP_FS_PARENT, xattr_name, xattr_value_a, +- strlen(xattr_value_a) + 1, 0); +- if (!ASSERT_OK(err, "setxattr 1")) +- goto error; +- +- err = setxattr(CGROUP_FS_CHILD, xattr_name, xattr_value_b, +- strlen(xattr_value_b) + 1, 0); +- if (!ASSERT_OK(err, "setxattr 2")) +- goto error; +- +- err = system("ip link set dev lo up"); +- if (!ASSERT_OK(err, "lo up")) +- goto error; +- +- err = system("ip addr add 1.1.1.1 dev lo"); +- if (!ASSERT_OK(err, "lo addr v4")) +- goto error; +- +- err = write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0"); +- if (!ASSERT_OK(err, "write_sysctl")) +- goto error; +- +- return 0; +-error: +- reset_cgroups_and_lo(); +- return err; +-} +- + static void test_read_cgroup_xattr(void) + { +- struct sockaddr_in sa4 = { +- .sin_family = AF_INET, +- .sin_addr.s_addr = htonl(INADDR_LOOPBACK), +- }; ++ int tmp_fd, parent_cgroup_fd = -1, child_cgroup_fd = -1; + struct read_cgroupfs_xattr *skel = NULL; +- pid_t pid = gettid(); +- int sock_fd = -1; +- int connect_fd = -1; + +- if (!ASSERT_OK(setup_cgroups_and_lo(), "setup_cgroups_and_lo")) ++ parent_cgroup_fd = test__join_cgroup(CGROUP_FS_PARENT); ++ if (!ASSERT_OK_FD(parent_cgroup_fd, "create parent cgroup")) + return; +- if (!ASSERT_OK(move_pid_to_cgroup(CGROUP_FS_CHILD, pid), +- "move_pid_to_cgroup")) ++ if (!ASSERT_OK(set_cgroup_xattr(CGROUP_FS_PARENT, xattr_name, xattr_value_a), ++ "set parent xattr")) ++ goto out; ++ ++ child_cgroup_fd = test__join_cgroup(CGROUP_FS_CHILD); ++ if (!ASSERT_OK_FD(child_cgroup_fd, "create child cgroup")) ++ goto out; ++ if (!ASSERT_OK(set_cgroup_xattr(CGROUP_FS_CHILD, xattr_name, xattr_value_b), ++ "set child xattr")) + goto out; + + skel = read_cgroupfs_xattr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "read_cgroupfs_xattr__open_and_load")) + goto out; + +- skel->bss->target_pid = pid; ++ skel->bss->target_pid = gettid(); + + if (!ASSERT_OK(read_cgroupfs_xattr__attach(skel), "read_cgroupfs_xattr__attach")) + goto out; + +- sock_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP); +- if (!ASSERT_OK_FD(sock_fd, "sock create")) +- goto out; +- +- connect_fd = connect(sock_fd, &sa4, sizeof(sa4)); +- if (!ASSERT_OK_FD(connect_fd, "connect 1")) +- goto out; +- close(connect_fd); ++ tmp_fd = open(TMP_FILE, O_RDONLY | O_CREAT); ++ ASSERT_OK_FD(tmp_fd, "open tmp file"); ++ close(tmp_fd); + + ASSERT_TRUE(skel->bss->found_value_a, "found_value_a"); + ASSERT_TRUE(skel->bss->found_value_b, "found_value_b"); + + out: +- close(connect_fd); +- close(sock_fd); ++ close(child_cgroup_fd); ++ close(parent_cgroup_fd); + read_cgroupfs_xattr__destroy(skel); +- move_pid_to_cgroup(CGROUP_FS_ROOT, pid); +- reset_cgroups_and_lo(); ++ unlink(TMP_FILE); + } + + void test_cgroup_xattr(void) +diff --git a/tools/testing/selftests/bpf/progs/read_cgroupfs_xattr.c b/tools/testing/selftests/bpf/progs/read_cgroupfs_xattr.c +index 855f85fc5522..405adbe5e8b0 100644 +--- a/tools/testing/selftests/bpf/progs/read_cgroupfs_xattr.c ++++ b/tools/testing/selftests/bpf/progs/read_cgroupfs_xattr.c +@@ 
-17,8 +17,8 @@ static const char expected_value_b[] = "bpf_selftest_value_b";
+ bool found_value_a;
+ bool found_value_b;
+
+-SEC("lsm.s/socket_connect")
+-int BPF_PROG(test_socket_connect)
++SEC("lsm.s/file_open")
++int BPF_PROG(test_file_open)
+ {
+ u64 cgrp_id = bpf_get_current_cgroup_id();
+ struct cgroup_subsys_state *css, *tmp;
+--
+2.50.0
+
diff --git a/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch
new file mode 100644
index 0000000000000..1c078304d58b3
--- /dev/null
+++ b/ci/diffs/3001-selftests-bpf-Fix-stdout-race-condition-in-traffic-m.patch
@@ -0,0 +1,112 @@
+From b99f27e90268b1a814c13f8bd72ea1db448ea257 Mon Sep 17 00:00:00 2001
+From: Amery Hung
+Date: Thu, 13 Feb 2025 15:32:17 -0800
+Subject: [PATCH] selftests/bpf: Fix stdout race condition in traffic monitor
+
+Fix a race condition between the main test_progs thread and the traffic
+monitoring thread. The traffic monitor thread tries to print a line
+using multiple printf calls and uses flockfile() to prevent the line from
+being torn apart. Meanwhile, the main thread doing I/O redirection can
+reassign or close stdout when going through tests. A deadlock as shown
+below can happen.
+
+ main                        traffic_monitor_thread
+ ====                        ======================
+                             show_transport()
+                             -> flockfile(stdout)
+
+stdio_hijack_init()
+-> stdout = open_memstream(log_buf, log_cnt);
+   ...
+   env.subtest_state->stdout_saved = stdout;
+                             ...
+                             funlockfile(stdout)
+
+stdio_restore_cleanup()
+-> fclose(env.subtest_state->stdout_saved);
+
+After the traffic monitor thread locks stdout, a new memstream can be
+assigned to stdout by the main thread. Therefore, the traffic monitor
+thread later will not be able to unlock the original stdout. As the
+main thread tries to access the old stdout, it will hang indefinitely
+as it is still locked by the traffic monitor thread.
+
+The deadlock can be reproduced by running test_progs repeatedly with
+traffic monitor enabled:
+
+for ((i=1;i<=100;i++)); do
+ ./test_progs -a flow_dissector_skb* -m '*'
+done
+
+Fix this by only calling printf once and removing flockfile()/funlockfile().
+
+Signed-off-by: Amery Hung
+Signed-off-by: Martin KaFai Lau
+Link: https://patch.msgid.link/20250213233217.553258-1-ameryhung@gmail.com
+---
+ tools/testing/selftests/bpf/network_helpers.c | 33 ++++++++-----------
+ 1 file changed, 13 insertions(+), 20 deletions(-)
+
+diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
+index a4252e000428..737a952dcf80 100644
+--- a/tools/testing/selftests/bpf/network_helpers.c
++++ b/tools/testing/selftests/bpf/network_helpers.c
+@@ -788,12 +788,13 @@ static const char *pkt_type_str(u16 pkt_type)
+ return "Unknown";
+ }
+
++#define MAX_FLAGS_STRLEN 21
+ /* Show the information of the transport layer in the packet */
+ static void show_transport(const u_char *packet, u16 len, u32 ifindex,
+ const char *src_addr, const char *dst_addr,
+ u16 proto, bool ipv6, u8 pkt_type)
+ {
+- char *ifname, _ifname[IF_NAMESIZE];
++ char *ifname, _ifname[IF_NAMESIZE], flags[MAX_FLAGS_STRLEN] = "";
+ const char *transport_str;
+ u16 src_port, dst_port;
+ struct udphdr *udp;
+@@ -834,29 +835,21 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex,
+
+ /* TCP or UDP*/
+
+- flockfile(stdout);
++ if (proto == IPPROTO_TCP)
++ snprintf(flags, MAX_FLAGS_STRLEN, "%s%s%s%s",
++ tcp->fin ? ", FIN" : "",
++ tcp->syn ? ", SYN" : "",
++ tcp->rst ? ", RST" : "",
++ tcp->ack ? 
", ACK" : ""); ++ + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d", ++ printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d", ++ printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len); +- +- if (proto == IPPROTO_TCP) { +- if (tcp->fin) +- printf(", FIN"); +- if (tcp->syn) +- printf(", SYN"); +- if (tcp->rst) +- printf(", RST"); +- if (tcp->ack) +- printf(", ACK"); +- } +- +- printf("\n"); +- funlockfile(stdout); ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +-- +2.48.1 + diff --git a/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch new file mode 100644 index 0000000000000..3c0ad2dee6c0e --- /dev/null +++ b/ci/diffs/3002-selftests-bpf-Clean-up-call-sites-of-stdio_restore.patch @@ -0,0 +1,106 @@ +From aeb5bbb0253892f601702aee9fc00038824a1c59 Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:55 -0800 +Subject: [PATCH] selftests/bpf: Clean up call sites of stdio_restore() + +reset_affinity() and save_ns() are only called in run_one_test(). There is +no need to call stdio_restore() in reset_affinity() and save_ns() if +stdio_restore() is moved right after a test finishes in run_one_test(). + +Also remove an unnecessary check of env.stdout_saved in crash_handler() +by moving env.stdout_saved assignment to the beginning of main(). + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-1-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 17 ++++++----------- + 1 file changed, 6 insertions(+), 11 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index 0cb759632225..ab0f2fed3c58 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -474,8 +474,6 @@ static void dump_test_log(const struct prog_test_def *test, + print_test_result(test, test_state); + } + +-static void stdio_restore(void); +- + /* A bunch of tests set custom affinity per-thread and/or per-process. Reset + * it after each test/sub-test. 
+ */ +@@ -490,13 +488,11 @@ static void reset_affinity(void) + + err = sched_setaffinity(0, sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset process affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } + err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + if (err < 0) { +- stdio_restore(); + fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -514,7 +510,6 @@ static void save_netns(void) + static void restore_netns(void) + { + if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) { +- stdio_restore(); + perror("setns(CLONE_NEWNS)"); + exit(EXIT_ERR_SETUP_INFRA); + } +@@ -1270,8 +1265,7 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- if (env.stdout_saved) +- stdio_restore(); ++ stdio_restore(); + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1400,6 +1394,8 @@ static void run_one_test(int test_num) + + state->tested = true; + ++ stdio_restore(); ++ + if (verbose() && env.worker_id == -1) + print_test_result(test, state); + +@@ -1408,7 +1404,6 @@ static void run_one_test(int test_num) + if (test->need_cgroup_cleanup) + cleanup_cgroup_environment(); + +- stdio_restore(); + free(stop_libbpf_log_capture()); + + dump_test_log(test, state, false, false, NULL); +@@ -1943,6 +1938,9 @@ int main(int argc, char **argv) + + sigaction(SIGSEGV, &sigact, NULL); + ++ env.stdout_saved = stdout; ++ env.stderr_saved = stderr; ++ + env.secs_till_notify = 10; + env.secs_till_kill = 120; + err = argp_parse(&argp, argc, argv, 0, NULL, &env); +@@ -1969,9 +1967,6 @@ int main(int argc, char **argv) + return -1; + } + +- env.stdout_saved = stdout; +- env.stderr_saved = stderr; +- + env.has_testmod = true; + if (!env.list_test_names) { + /* ensure previous instance of the module is unloaded */ +-- +2.48.1 + diff --git a/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch new file mode 100644 index 0000000000000..746d1d6210b1f --- /dev/null +++ b/ci/diffs/3003-selftests-bpf-Allow-assigning-traffic-monitor-print-.patch @@ -0,0 +1,179 @@ +From 34a25aabcdea5c6e42a283381f8354d70592744a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:56 -0800 +Subject: [PATCH] selftests/bpf: Allow assigning traffic monitor print function + +Allow users to change traffic monitor's print function. If not provided, +traffic monitor will print to stdout by default. 
+ +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Link: https://patch.msgid.link/20250305182057.2802606-2-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/network_helpers.c | 69 ++++++++++++++----- + tools/testing/selftests/bpf/network_helpers.h | 9 +++ + 2 files changed, 59 insertions(+), 19 deletions(-) + +diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c +index 737a952dcf80..97fb7caf95f4 100644 +--- a/tools/testing/selftests/bpf/network_helpers.c ++++ b/tools/testing/selftests/bpf/network_helpers.c +@@ -750,6 +750,36 @@ struct tmonitor_ctx { + int pcap_fd; + }; + ++static int __base_pr(const char *format, va_list args) ++{ ++ return vfprintf(stdout, format, args); ++} ++ ++static tm_print_fn_t __tm_pr = __base_pr; ++ ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ tm_print_fn_t old_print_fn; ++ ++ old_print_fn = __atomic_exchange_n(&__tm_pr, fn, __ATOMIC_RELAXED); ++ ++ return old_print_fn; ++} ++ ++void tm_print(const char *format, ...) ++{ ++ tm_print_fn_t print_fn; ++ va_list args; ++ ++ print_fn = __atomic_load_n(&__tm_pr, __ATOMIC_RELAXED); ++ if (!print_fn) ++ return; ++ ++ va_start(args, format); ++ print_fn(format, args); ++ va_end(args); ++} ++ + /* Is this packet captured with a Ethernet protocol type? */ + static bool is_ethernet(const u_char *packet) + { +@@ -767,7 +797,7 @@ static bool is_ethernet(const u_char *packet) + case 770: /* ARPHRD_FRAD */ + case 778: /* ARPHDR_IPGRE */ + case 803: /* ARPHRD_IEEE80211_RADIOTAP */ +- printf("Packet captured: arphdr_type=%d\n", arphdr_type); ++ tm_print("Packet captured: arphdr_type=%d\n", arphdr_type); + return false; + } + return true; +@@ -817,19 +847,19 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + dst_port = ntohs(tcp->dest); + transport_str = "TCP"; + } else if (proto == IPPROTO_ICMP) { +- printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else if (proto == IPPROTO_ICMPV6) { +- printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", +- ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, +- packet[0], packet[1]); ++ tm_print("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", ++ ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, ++ packet[0], packet[1]); + return; + } else { +- printf("%-7s %-3s %s %s > %s: protocol %d\n", +- ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", +- src_addr, dst_addr, proto); ++ tm_print("%-7s %-3s %s %s > %s: protocol %d\n", ++ ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", ++ src_addr, dst_addr, proto); + return; + } + +@@ -843,13 +873,13 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, + tcp->ack ? 
", ACK" : ""); + + if (ipv6) +- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + else +- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", +- ifname, pkt_type_str(pkt_type), src_addr, src_port, +- dst_addr, dst_port, transport_str, len, flags); ++ tm_print("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", ++ ifname, pkt_type_str(pkt_type), src_addr, src_port, ++ dst_addr, dst_port, transport_str, len, flags); + } + + static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) +@@ -964,8 +994,8 @@ static void *traffic_monitor_thread(void *arg) + ifname = _ifname; + } + +- printf("%-7s %-3s Unknown network protocol type 0x%x\n", +- ifname, pkt_type_str(ptype), proto); ++ tm_print("%-7s %-3s Unknown network protocol type 0x%x\n", ++ ifname, pkt_type_str(ptype), proto); + } + } + +@@ -1165,8 +1195,9 @@ void traffic_monitor_stop(struct tmonitor_ctx *ctx) + write(ctx->wake_fd, &w, sizeof(w)); + pthread_join(ctx->thread, NULL); + +- printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); ++ tm_print("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + + traffic_monitor_release(ctx); + } ++ + #endif /* TRAFFIC_MONITOR */ +diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h +index 9f6e05d886c5..a4e20c12c57e 100644 +--- a/tools/testing/selftests/bpf/network_helpers.h ++++ b/tools/testing/selftests/bpf/network_helpers.h +@@ -17,6 +17,7 @@ typedef __u16 __sum16; + #include + #include + #include ++#include + + #define MAGIC_VAL 0x1234 + #define NUM_ITER 100000 +@@ -249,10 +250,13 @@ static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, + + struct tmonitor_ctx; + ++typedef int (*tm_print_fn_t)(const char *format, va_list args); ++ + #ifdef TRAFFIC_MONITOR + struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name); + void traffic_monitor_stop(struct tmonitor_ctx *ctx); ++tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn); + #else + static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, + const char *subtest_name) +@@ -263,6 +267,11 @@ static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, cons + static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) + { + } ++ ++static inline tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) ++{ ++ return NULL; ++} + #endif + + #endif +-- +2.48.1 + diff --git a/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch new file mode 100644 index 0000000000000..ad9f297ed14e9 --- /dev/null +++ b/ci/diffs/3004-selftests-bpf-Fix-dangling-stdout-seen-by-traffic-mo.patch @@ -0,0 +1,131 @@ +From 8bda5b787dea43a7102d5797756c60d0a905932a Mon Sep 17 00:00:00 2001 +From: Amery Hung +Date: Wed, 5 Mar 2025 10:20:57 -0800 +Subject: [PATCH] selftests/bpf: Fix dangling stdout seen by traffic monitor + thread + +Traffic monitor thread may see dangling stdout as the main thread closes +and reassigns stdout without protection. This happens when the main thread +finishes one subtest and moves to another one in the same netns_new() +scope. 
+ +The issue can be reproduced by running test_progs repeatedly with traffic +monitor enabled: + +for ((i=1;i<=100;i++)); do + ./test_progs -a flow_dissector_skb* -m '*' +done + +For restoring stdout in crash_handler(), since it does not really care +about closing stdout, simply flush stdout and restore it to the original +one. + +Then, fix the issue by consolidating stdio_restore_cleanup() and +stdio_restore(), and protecting the use/close/assignment of stdout with +a lock. The locking in the main thread is always performed regardless of +whether the traffic monitor is running or not, for simplicity. It won't have +any side effects. + +Signed-off-by: Amery Hung +Signed-off-by: Martin KaFai Lau +Acked-by: Eduard Zingerman +Link: https://patch.msgid.link/20250305182057.2802606-3-ameryhung@gmail.com +Signed-off-by: Alexei Starovoitov +--- + tools/testing/selftests/bpf/test_progs.c | 39 +++++++++++++----------- + 1 file changed, 22 insertions(+), 17 deletions(-) + +diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c +index ab0f2fed3c58..d4ec9586b98c 100644 +--- a/tools/testing/selftests/bpf/test_progs.c ++++ b/tools/testing/selftests/bpf/test_progs.c +@@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) + #endif + } + +-static void stdio_restore_cleanup(void) ++static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++static void stdio_restore(void) + { + #ifdef __GLIBC__ + if (verbose() && env.worker_id == -1) { +@@ -98,6 +100,8 @@ static void stdio_restore_cleanup(void) + + fflush(stdout); + ++ pthread_mutex_lock(&stdout_lock); ++ + if (env.subtest_state) { + fclose(env.subtest_state->stdout_saved); + env.subtest_state->stdout_saved = NULL; +@@ -106,26 +110,21 @@ static void stdio_restore_cleanup(void) + } else { + fclose(env.test_state->stdout_saved); + env.test_state->stdout_saved = NULL; ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; + } ++ ++ pthread_mutex_unlock(&stdout_lock); + #endif + } + +-static void stdio_restore(void) ++static int traffic_monitor_print_fn(const char *format, va_list args) + { +-#ifdef __GLIBC__ +- if (verbose() && env.worker_id == -1) { +- /* nothing to do, output to stdout by default */ +- return; +- } +- +- if (stdout == env.stdout_saved) +- return; +- +- stdio_restore_cleanup(); ++ pthread_mutex_lock(&stdout_lock); ++ vfprintf(stdout, format, args); ++ pthread_mutex_unlock(&stdout_lock); + +- stdout = env.stdout_saved; +- stderr = env.stderr_saved; +-#endif ++ return 0; + } + + /* Adapted from perf/util/string.c */ +@@ -536,7 +535,8 @@ void test__end_subtest(void) + test_result(subtest_state->error_cnt, + subtest_state->skipped)); + +- stdio_restore_cleanup(); ++ stdio_restore(); ++ + env.subtest_state = NULL; + } + +@@ -1265,7 +1265,10 @@ void crash_handler(int signum) + + sz = backtrace(bt, ARRAY_SIZE(bt)); + +- stdio_restore(); ++ fflush(stdout); ++ stdout = env.stdout_saved; ++ stderr = env.stderr_saved; ++ + if (env.test) { + env.test_state->error_cnt++; + dump_test_log(env.test, env.test_state, true, false, NULL); +@@ -1957,6 +1960,8 @@ int main(int argc, char **argv) + libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + libbpf_set_print(libbpf_print_fn); + ++ traffic_monitor_set_print(traffic_monitor_print_fn); ++ + srand(time(NULL)); + + env.jit_enabled = is_jit_enabled(); +-- +2.48.1 + diff --git a/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch new file mode 100644 index
0000000000000..31bc3fb4717ec --- /dev/null +++ b/ci/diffs/8888-netfs-Fix-setting-NETFS_RREQ_ALL_QUEUED-to-be-after-.patch @@ -0,0 +1,100 @@ +From a99b3ef16d4473f8f9b7547f89791d037112a7e6 Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Wed, 12 Feb 2025 22:24:01 +0000 +Subject: [PATCH] netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all + subreqs queued + +Due to the code that queues a subreq on the active subrequest list getting +moved to netfs_issue_read(), the NETFS_RREQ_ALL_QUEUED flag may now get set +before the list-add actually happens. This is not a problem if the +collection worker happens after the list-add, but it's a race - and, for +9P, where the read from the server is synchronous and done in the +submitting thread, this is a lot more likely. + +The result is that, if the timing is wrong, a ref gets leaked because the +collector thinks that all the subreqs have completed (because it can't see +the last one yet) and clears NETFS_RREQ_IN_PROGRESS - at which point, the +collection worker no longer goes into the collector. + +This can be provoked with AFS by injecting an msleep() right before the +final subreq is queued. + +Fix this by splitting the queuing part out of netfs_issue_read() into a new +function, netfs_queue_read(), and calling it separately. The setting of +NETFS_RREQ_ALL_QUEUED is then done by netfs_queue_read() whilst it is +holding the spinlock (that's probably unnecessary, but shouldn't hurt). + +It might be better to set a flag on the final subreq, but this could be a +problem if an error occurs and we can't queue it. + +Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") +Reported-by: Ihor Solodrai +Closes: https://lore.kernel.org/r/a7x33d4dnMdGTtRivptq6S1i8btK70SNBP2XyX_xwDAhLvgQoPox6FVBOkifq4eBinfFfbZlIkMZBe3QarlWTxoEtHZwJCZbNKtaqrR7PvI=@pm.me/ +Signed-off-by: David Howells +Tested-by: Ihor Solodrai +cc: Eric Van Hensbergen +cc: Latchesar Ionkov +cc: Dominique Martinet +cc: Christian Schoenebeck +cc: Marc Dionne +cc: Steve French +cc: Paulo Alcantara +cc: Jeff Layton +cc: v9fs@lists.linux.dev +cc: linux-cifs@vger.kernel.org +cc: netfs@lists.linux.dev +cc: linux-fsdevel@vger.kernel.org +--- + fs/netfs/buffered_read.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c +index f761d44b3436..0d1b6d35ff3b 100644 +--- a/fs/netfs/buffered_read.c ++++ b/fs/netfs/buffered_read.c +@@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + netfs_cache_read_terminated, subreq); + } + +-static void netfs_issue_read(struct netfs_io_request *rreq, +- struct netfs_io_subrequest *subreq) ++static void netfs_queue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq, ++ bool last_subreq) + { + struct netfs_io_stream *stream = &rreq->io_streams[0]; + +@@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq, + } + } + ++ if (last_subreq) { ++ smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ ++ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); ++ } ++ + spin_unlock(&rreq->lock); ++} + ++static void netfs_issue_read(struct netfs_io_request *rreq, ++ struct netfs_io_subrequest *subreq) ++{ + switch (subreq->source) { + case NETFS_DOWNLOAD_FROM_SERVER: + rreq->netfs_ops->issue_read(subreq); +@@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) + } + size -= slice; + start += slice; +- if (size <= 0) { +- smp_wmb(); /* Write lists before ALL_QUEUED. */ +- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); +- } + ++ netfs_queue_read(rreq, subreq, size <= 0); + netfs_issue_read(rreq, subreq); + cond_resched(); + } while (size > 0); +-- +2.48.1 + diff --git a/ci/vmtest/configs/DENYLIST b/ci/vmtest/configs/DENYLIST new file mode 100644 index 0000000000000..20a090295a607 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST @@ -0,0 +1,17 @@ +# TEMPORARY +btf_dump/btf_dump: syntax +kprobe_multi_bench_attach +core_reloc/enum64val +core_reloc/size___diff_sz +core_reloc/type_based___diff_sz +test_ima # All of CI is broken on it following 6.3-rc1 merge +lwt_reroute # crashes kernel after netnext merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +tc_links_ingress # started failing after net-next merge from 2ab1efad60ad "net/sched: cls_api: complement tcf_tfilter_dump_policy" +xdp_bonding/xdp_bonding_features # started failing after net merge from 359e54a93ab4 "l2tp: pass correct message length to ip6_append_data" +tc_redirect/tc_redirect_dtime # uapi breakage after net-next commit 885c36e59f46 ("net: Re-use and set mono_delivery_time bit for userspace tstamp packets") +migrate_reuseport/IPv4 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +migrate_reuseport/IPv6 TCP_NEW_SYN_RECV reqsk_timer_handler # flaky, under investigation +connect_force_port # unreliably fails +sockmap_ktls/sockmap_ktls disconnect_after_delete* # https://lore.kernel.org/bpf/20250415163332.1836826-1-ihor.solodrai@linux.dev/ +verif_scale_pyperf600 # llvm 20 generates code that fails verification +arena_spin_lock # llvm 20 generates code that fails verification diff --git a/ci/vmtest/configs/DENYLIST.aarch64 b/ci/vmtest/configs/DENYLIST.aarch64 new file mode 100644 index 0000000000000..bdce99f3855ec --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.aarch64 @@ -0,0 +1,5 @@ +cgrp_local_storage # libbpf: prog 'update_cookie_tracing': failed to attach: ERROR: strerror_r(-524)=22 +core_reloc_btfgen # run_core_reloc_tests:FAIL:run_btfgen unexpected error: 32512 (errno 22) +usdt/multispec # usdt_300_bad_attach unexpected pointer: 0x558c63d8f0 +xdp_bonding # whole test suite is very unstable on aarch64 +res_spin_lock_success # flaky diff --git a/ci/vmtest/configs/DENYLIST.rc b/ci/vmtest/configs/DENYLIST.rc new file mode 100644 index 0000000000000..8aa33e6b71443 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.rc @@ -0,0 +1,3 @@ +send_signal/send_signal_nmi # PMU events configure correctly but don't trigger NMI's for some reason (AMD nested virt) +send_signal/send_signal_nmi_thread # Same as above +token/obj_priv_implicit_token_envvar # Unknown root cause, but reliably fails diff --git a/ci/vmtest/configs/DENYLIST.s390x b/ci/vmtest/configs/DENYLIST.s390x new file mode 100644 index 0000000000000..9b90b615aea55 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.s390x @@ -0,0 +1,11 @@ +deny_namespace # not yet in bpf denylist +tc_redirect/tc_redirect_dtime # very flaky +lru_bug # not yet in bpf-next denylist +# Disabled temporarily for a crash. 
+# https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ +dummy_st_ops/dummy_init_ptr_arg +fexit_bpf2bpf +tailcalls +trace_ext +xdp_bpf2bpf +xdp_metadata diff --git a/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc new file mode 100644 index 0000000000000..a3c745d1f5b52 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs-bpf_gcc @@ -0,0 +1,904 @@ +arena_htab +async_stack_depth +bad_struct_ops/invalid_prog_reuse +bpf_cookie +bpf_iter/bpf_hash_map +bpf_iter/ksym +bpf_iter/tcp4 +bpf_iter/tcp6 +bpf_iter/udp4 +bpf_iter/udp6 +bpf_iter/unix +bpf_iter_setsockopt +bpf_iter_setsockopt_unix +bpf_mod_race +bpf_nf/tc-bpf-ct +bpf_nf/xdp-ct +bpf_tcp_ca/cubic +btf_dump/btf_dump: bitfields +btf_dump/btf_dump: packing +btf_dump/btf_dump: padding +btf_dump/btf_dump: syntax +btf_map_in_map +cb_refs +cgroup_get_current_cgroup_id +cgroup_iter/cgroup_iter__self_only_css_task +cgroup_tcp_skb +cgrp_kfunc +cls_redirect/cls_redirect_dynptr +connect_force_port +core_autosize +core_read_macros +core_reloc/type_id +core_reloc/type_id___missing_targets +core_reloc_btfgen/type_id +core_reloc_btfgen/type_id___missing_targets +cpumask/test_acquire_wrong_cpumask +cpumask/test_alloc_double_release +cpumask/test_alloc_free_cpumask +cpumask/test_alloc_no_release +cpumask/test_and_or_xor +cpumask/test_copy_any_anyand +cpumask/test_cpumask_null +cpumask/test_cpumask_weight +cpumask/test_first_firstzero_cpu +cpumask/test_firstand_nocpu +cpumask/test_global_mask_array_l2_rcu +cpumask/test_global_mask_array_one_rcu +cpumask/test_global_mask_array_rcu +cpumask/test_global_mask_nested_deep_array_rcu +cpumask/test_global_mask_nested_deep_rcu +cpumask/test_global_mask_nested_rcu +cpumask/test_global_mask_no_null_check +cpumask/test_global_mask_out_of_rcu +cpumask/test_global_mask_rcu +cpumask/test_global_mask_rcu_no_null_check +cpumask/test_insert_leave +cpumask/test_insert_remove_no_release +cpumask/test_insert_remove_release +cpumask/test_intersects_subset +cpumask/test_invalid_nested_array +cpumask/test_mutate_cpumask +cpumask/test_set_clear_cpu +cpumask/test_setall_clear_cpu +cpumask/test_test_and_set_clear +crypto_basic/crypto_acquire +crypto_sanity +deny_namespace +dummy_st_ops/test_unsupported_field_sleepable +dynptr/add_dynptr_to_map1 +dynptr/add_dynptr_to_map2 +dynptr/clone_invalid1 +dynptr/clone_invalid2 +dynptr/clone_invalidate1 +dynptr/clone_invalidate2 +dynptr/clone_invalidate3 +dynptr/clone_invalidate4 +dynptr/clone_invalidate5 +dynptr/clone_invalidate6 +dynptr/clone_skb_packet_data +dynptr/clone_xdp_packet_data +dynptr/data_slice_missing_null_check1 +dynptr/data_slice_missing_null_check2 +dynptr/data_slice_out_of_bounds_map_value +dynptr/data_slice_out_of_bounds_ringbuf +dynptr/data_slice_out_of_bounds_skb +dynptr/data_slice_use_after_release1 +dynptr/data_slice_use_after_release2 +dynptr/dynptr_adjust_invalid +dynptr/dynptr_from_mem_invalid_api +dynptr/dynptr_invalidate_slice_failure +dynptr/dynptr_invalidate_slice_or_null +dynptr/dynptr_invalidate_slice_reinit +dynptr/dynptr_is_null_invalid +dynptr/dynptr_is_rdonly_invalid +dynptr/dynptr_overwrite_ref +dynptr/dynptr_partial_slot_invalidate +dynptr/dynptr_pruning_overwrite +dynptr/dynptr_pruning_type_confusion +dynptr/dynptr_read_into_slot +dynptr/dynptr_size_invalid +dynptr/dynptr_slice_var_len1 +dynptr/dynptr_slice_var_len2 +dynptr/dynptr_var_off_overwrite +dynptr/global +dynptr/invalid_data_slices +dynptr/invalid_helper1 +dynptr/invalid_helper2 +dynptr/invalid_offset +dynptr/invalid_read1 
+dynptr/invalid_read2 +dynptr/invalid_read3 +dynptr/invalid_read4 +dynptr/invalid_slice_rdwr_rdonly +dynptr/invalid_write1 +dynptr/invalid_write2 +dynptr/invalid_write3 +dynptr/invalid_write4 +dynptr/release_twice +dynptr/release_twice_callback +dynptr/ringbuf_invalid_api +dynptr/ringbuf_missing_release1 +dynptr/ringbuf_missing_release2 +dynptr/ringbuf_missing_release_callback +dynptr/ringbuf_release_uninit_dynptr +dynptr/skb_invalid_ctx +dynptr/skb_invalid_ctx_fentry +dynptr/skb_invalid_ctx_fexit +dynptr/skb_invalid_data_slice1 +dynptr/skb_invalid_data_slice2 +dynptr/skb_invalid_data_slice3 +dynptr/skb_invalid_data_slice4 +dynptr/skb_invalid_slice_write +dynptr/test_dynptr_reg_type +dynptr/test_dynptr_skb_no_buff +dynptr/test_dynptr_skb_small_buff +dynptr/test_dynptr_skb_tp_btf +dynptr/test_read_write +dynptr/uninit_write_into_slot +dynptr/use_after_invalid +dynptr/xdp_invalid_ctx +dynptr/xdp_invalid_data_slice1 +dynptr/xdp_invalid_data_slice2 +exceptions/check_assert_eq_int_max +exceptions/check_assert_eq_int_min +exceptions/check_assert_eq_llong_max +exceptions/check_assert_eq_llong_min +exceptions/check_assert_eq_zero +exceptions/check_assert_ge_neg +exceptions/check_assert_ge_pos +exceptions/check_assert_ge_zero +exceptions/check_assert_generic +exceptions/check_assert_gt_neg +exceptions/check_assert_gt_pos +exceptions/check_assert_gt_zero +exceptions/check_assert_le_neg +exceptions/check_assert_le_pos +exceptions/check_assert_le_zero +exceptions/check_assert_lt_neg +exceptions/check_assert_lt_pos +exceptions/check_assert_lt_zero +exceptions/check_assert_range_s64 +exceptions/check_assert_range_u64 +exceptions/check_assert_single_range_s64 +exceptions/check_assert_single_range_u64 +exceptions/check_assert_with_return +exceptions/exception_ext +exceptions/exception_ext_mod_cb_runtime +exceptions/non-throwing extension -> non-throwing subprog +exceptions/non-throwing extension -> throwing global subprog +exceptions/non-throwing fentry -> exception_cb +exceptions/non-throwing fexit -> exception_cb +exceptions/non-throwing fmod_ret -> non-throwing global subprog +exceptions/reject_async_callback_throw +exceptions/reject_exception_throw_cb +exceptions/reject_exception_throw_cb_diff +exceptions/reject_set_exception_cb_bad_ret2 +exceptions/reject_subprog_with_lock +exceptions/reject_subprog_with_rcu_read_lock +exceptions/reject_with_cb +exceptions/reject_with_cb_reference +exceptions/reject_with_lock +exceptions/reject_with_rbtree_add_throw +exceptions/reject_with_rcu_read_lock +exceptions/reject_with_reference +exceptions/reject_with_subprog_reference +exceptions/throwing extension (with custom cb) -> exception_cb +exceptions/throwing extension -> global func in exception_cb +exceptions/throwing extension -> non-throwing global subprog +exceptions/throwing extension -> throwing global subprog +exceptions/throwing fentry -> exception_cb +exceptions/throwing fexit -> exception_cb +failures_wq +fexit_bpf2bpf/fmod_ret_freplace +fexit_bpf2bpf/func_replace +fexit_bpf2bpf/func_replace_global_func +fexit_bpf2bpf/func_replace_multi +fexit_bpf2bpf/func_sockmap_update +fexit_bpf2bpf/target_yes_callees +global_func_dead_code +global_map_resize +inner_array_lookup +irq/irq_flag_overwrite +irq/irq_flag_overwrite_partial +irq/irq_global_subprog +irq/irq_ooo_refs_array +irq/irq_restore_4_subprog +irq/irq_restore_bad_arg +irq/irq_restore_invalid +irq/irq_restore_iter +irq/irq_restore_missing_1_subprog +irq/irq_restore_missing_2 +irq/irq_restore_missing_2_subprog +irq/irq_restore_missing_3 
+irq/irq_restore_missing_3_minus_2 +irq/irq_restore_missing_3_minus_2_subprog +irq/irq_restore_missing_3_subprog +irq/irq_restore_ooo +irq/irq_restore_ooo_3 +irq/irq_restore_ooo_3_subprog +irq/irq_save_bad_arg +irq/irq_save_invalid +irq/irq_save_iter +irq/irq_sleepable_helper +irq/irq_sleepable_kfunc +iters/compromise_iter_w_direct_write_and_skip_destroy_fail +iters/compromise_iter_w_direct_write_fail +iters/compromise_iter_w_helper_write_fail +iters/create_and_forget_to_destroy_fail +iters/css_task +iters/delayed_precision_mark +iters/delayed_read_mark +iters/destroy_without_creating_fail +iters/double_create_fail +iters/double_destroy_fail +iters/iter_css_lock_and_unlock +iters/iter_css_task_for_each +iters/iter_css_without_lock +iters/iter_destroy_bad_arg +iters/iter_err_too_permissive1 +iters/iter_err_too_permissive2 +iters/iter_err_too_permissive3 +iters/iter_err_unsafe_asm_loop +iters/iter_err_unsafe_c_loop +iters/iter_nested_iters +iters/iter_new_bad_arg +iters/iter_next_bad_arg +iters/iter_next_ptr_mem_not_trusted +iters/iter_next_rcu_not_trusted +iters/iter_next_rcu_or_null +iters/iter_next_trusted_or_null +iters/iter_obfuscate_counter +iters/iter_subprog_iters +iters/iter_tasks_lock_and_unlock +iters/iter_tasks_without_lock +iters/leak_iter_from_subprog_fail +iters/loop_state_deps1 +iters/loop_state_deps2 +iters/missing_null_check_fail +iters/next_after_destroy_fail +iters/next_without_new_fail +iters/read_from_iter_slot_fail +iters/stacksafe_should_not_conflate_stack_spill_and_iter +iters/testmod_seq_getter_after_bad +iters/testmod_seq_getter_before_bad +iters/wrong_sized_read_fail +jeq_infer_not_null +jit_probe_mem +kfree_skb +kfunc_call/kfunc_call_ctx +kfunc_call/kfunc_call_test1 +kfunc_call/kfunc_call_test2 +kfunc_call/kfunc_call_test4 +kfunc_call/kfunc_call_test_get_mem +kfunc_call/kfunc_call_test_ref_btf_id +kfunc_call/kfunc_call_test_static_unused_arg +kfunc_call/kfunc_syscall_test +kfunc_call/kfunc_syscall_test_null +kfunc_dynptr_param/not_ptr_to_stack +kfunc_dynptr_param/not_valid_dynptr +kfunc_param_nullable/kfunc_dynptr_nullable_test3 +kprobe_multi_test/kprobe_session_return_2 +kptr_xchg_inline +l4lb_all/l4lb_noinline +l4lb_all/l4lb_noinline_dynptr +linked_list +local_kptr_stash/drop_rb_node_off +local_kptr_stash/local_kptr_stash_local_with_root +local_kptr_stash/local_kptr_stash_plain +local_kptr_stash/local_kptr_stash_simple +local_kptr_stash/local_kptr_stash_unstash +local_kptr_stash/refcount_acquire_without_unstash +local_kptr_stash/stash_rb_nodes +log_buf/obj_load_log_buf +log_fixup/bad_core_relo_subprog +log_fixup/bad_core_relo_trunc_full +lru_bug +map_btf +map_in_map/acc_map_in_array +map_in_map/acc_map_in_htab +map_in_map/sleepable_acc_map_in_array +map_in_map/sleepable_acc_map_in_htab +map_kptr/correct_btf_id_check_size +map_kptr/inherit_untrusted_on_walk +map_kptr/kptr_xchg_possibly_null +map_kptr/kptr_xchg_ref_state +map_kptr/mark_ref_as_untrusted_or_null +map_kptr/marked_as_untrusted_or_null +map_kptr/non_const_var_off +map_kptr/non_const_var_off_kptr_xchg +map_kptr/reject_bad_type_xchg +map_kptr/reject_kptr_xchg_on_unref +map_kptr/reject_member_of_ref_xchg +map_kptr/reject_untrusted_xchg +map_kptr/success-map +map_ptr +nested_trust/test_invalid_nested_user_cpus +nested_trust/test_invalid_skb_field +percpu_alloc/array +percpu_alloc/array_sleepable +percpu_alloc/cgrp_local_storage +percpu_alloc/test_array_map_1 +percpu_alloc/test_array_map_2 +percpu_alloc/test_array_map_3 +percpu_alloc/test_array_map_4 +percpu_alloc/test_array_map_5 
+percpu_alloc/test_array_map_6 +percpu_alloc/test_array_map_7 +percpu_alloc/test_array_map_8 +perf_branches/perf_branches_no_hw +pkt_access +preempt_lock/preempt_global_subprog_test +preempt_lock/preempt_lock_missing_1 +preempt_lock/preempt_lock_missing_1_subprog +preempt_lock/preempt_lock_missing_2 +preempt_lock/preempt_lock_missing_2_minus_1_subprog +preempt_lock/preempt_lock_missing_2_subprog +preempt_lock/preempt_lock_missing_3 +preempt_lock/preempt_lock_missing_3_minus_2 +preempt_lock/preempt_sleepable_helper +preempt_lock/preempt_sleepable_kfunc +preempted_bpf_ma_op +prog_run_opts +prog_tests_framework +raw_tp_null +rbtree_fail +rbtree_success +recursion +refcounted_kptr +refcounted_kptr_fail +refcounted_kptr_wrong_owner +reference_tracking/sk_lookup_success +ringbuf_multi +setget_sockopt +sk_lookup +skc_to_unix_sock +sock_addr/recvmsg4: attach prog with wrong attach type +sock_addr/recvmsg4: recvfrom (dgram) +sock_addr/recvmsg6: attach prog with wrong attach type +sock_addr/recvmsg6: recvfrom (dgram) +sock_addr/sendmsg4: attach prog with wrong attach type +sock_addr/sendmsg4: kernel_sendmsg (dgram) +sock_addr/sendmsg4: kernel_sendmsg deny (dgram) +sock_addr/sendmsg4: sendmsg (dgram) +sock_addr/sendmsg4: sendmsg deny (dgram) +sock_addr/sendmsg4: sock_sendmsg (dgram) +sock_addr/sendmsg4: sock_sendmsg deny (dgram) +sock_addr/sendmsg6: attach prog with wrong attach type +sock_addr/sendmsg6: kernel_sendmsg (dgram) +sock_addr/sendmsg6: kernel_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: kernel_sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg (dgram) +sock_addr/sendmsg6: sendmsg IPv4-mapped IPv6 (dgram) +sock_addr/sendmsg6: sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sendmsg deny (dgram) +sock_addr/sendmsg6: sendmsg dst IP = [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg (dgram) +sock_addr/sendmsg6: sock_sendmsg [::] (BSD'ism) (dgram) +sock_addr/sendmsg6: sock_sendmsg deny (dgram) +sock_destroy/trace_tcp_destroy_sock +sock_fields +sockmap_listen/sockhash IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockhash IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockhash IPv6 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv4 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv4 UDP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 TCP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 TCP test_reuseport_select_connected +sockmap_listen/sockmap IPv6 UDP test_reuseport_mixed_groups +sockmap_listen/sockmap IPv6 UDP test_reuseport_select_connected +spin_lock +struct_ops_module/unsupported_ops +syscall +tailcalls/classifier_0 +tailcalls/classifier_1 +tailcalls/reject_tail_call_preempt_lock +tailcalls/reject_tail_call_rcu_lock +tailcalls/reject_tail_call_ref +tailcalls/reject_tail_call_spin_lock +tailcalls/tailcall_6 +tailcalls/tailcall_bpf2bpf_2 +tailcalls/tailcall_bpf2bpf_3 +tailcalls/tailcall_bpf2bpf_fentry +tailcalls/tailcall_bpf2bpf_fentry_entry +tailcalls/tailcall_bpf2bpf_fentry_fexit +tailcalls/tailcall_bpf2bpf_fexit +tailcalls/tailcall_bpf2bpf_hierarchy_2 
+tailcalls/tailcall_bpf2bpf_hierarchy_3 +task_kfunc +task_local_storage/uptr_across_pages +task_local_storage/uptr_basic +task_local_storage/uptr_kptr_xchg +task_local_storage/uptr_map_failure_e2big +task_local_storage/uptr_map_failure_kstruct +task_local_storage/uptr_map_failure_size0 +task_local_storage/uptr_no_null_check +task_local_storage/uptr_obj_new +task_local_storage/uptr_update_failure +tc_bpf/tc_bpf_non_root +tc_redirect/tc_redirect_dtime +tcp_custom_syncookie +tcp_hdr_options +test_bpf_ma +test_global_funcs/arg_tag_ctx_kprobe +test_global_funcs/arg_tag_ctx_perf +test_global_funcs/arg_tag_ctx_raw_tp +test_global_funcs/global_func1 +test_global_funcs/global_func10 +test_global_funcs/global_func11 +test_global_funcs/global_func12 +test_global_funcs/global_func13 +test_global_funcs/global_func14 +test_global_funcs/global_func15 +test_global_funcs/global_func15_tricky_pruning +test_global_funcs/global_func17 +test_global_funcs/global_func3 +test_global_funcs/global_func5 +test_global_funcs/global_func6 +test_global_funcs/global_func7 +test_lsm/lsm_basic +test_profiler +test_strncmp/strncmp_bad_not_null_term_target +timer +timer_mim +token +tp_btf_nullable/handle_tp_btf_nullable_bare1 +tunnel +uprobe_multi_test/uprobe_sesison_return_2 +user_ringbuf/user_ringbuf_callback_bad_access1 +user_ringbuf/user_ringbuf_callback_bad_access2 +user_ringbuf/user_ringbuf_callback_const_ptr_to_dynptr_reg_off +user_ringbuf/user_ringbuf_callback_discard_dynptr +user_ringbuf/user_ringbuf_callback_invalid_return +user_ringbuf/user_ringbuf_callback_null_context_read +user_ringbuf/user_ringbuf_callback_null_context_write +user_ringbuf/user_ringbuf_callback_reinit_dynptr_mem +user_ringbuf/user_ringbuf_callback_reinit_dynptr_ringbuf +user_ringbuf/user_ringbuf_callback_submit_dynptr +user_ringbuf/user_ringbuf_callback_write_forbidden +verif_scale_pyperf100 +verif_scale_pyperf180 +verif_scale_pyperf600 +verif_scale_pyperf600_nounroll +verif_scale_seg6_loop +verif_scale_strobemeta +verif_scale_strobemeta_nounroll1 +verif_scale_strobemeta_nounroll2 +verif_scale_strobemeta_subprogs +verif_scale_sysctl_loop1 +verif_scale_sysctl_loop2 +verif_scale_xdp_loop +verifier_and/invalid_and_of_negative_number +verifier_and/invalid_range_check +verifier_arena/iter_maps2 +verifier_arena/iter_maps3 +verifier_array_access/a_read_only_array_1_2 +verifier_array_access/a_read_only_array_2_2 +verifier_array_access/a_write_only_array_1_2 +verifier_array_access/a_write_only_array_2_2 +verifier_array_access/an_array_with_a_constant_2 +verifier_array_access/an_array_with_a_register_2 +verifier_array_access/an_array_with_a_variable_2 +verifier_array_access/array_with_no_floor_check +verifier_array_access/with_a_invalid_max_check_1 +verifier_array_access/with_a_invalid_max_check_2 +verifier_basic_stack/invalid_fp_arithmetic +verifier_basic_stack/misaligned_read_from_stack +verifier_basic_stack/stack_out_of_bounds +verifier_bitfield_write +verifier_bits_iter/destroy_uninit +verifier_bits_iter/next_uninit +verifier_bits_iter/no_destroy +verifier_bounds/bounds_map_value_variant_1 +verifier_bounds/bounds_map_value_variant_2 +verifier_bounds/of_boundary_crossing_range_1 +verifier_bounds/of_boundary_crossing_range_2 +verifier_bounds/on_sign_extended_mov_test1 +verifier_bounds/on_sign_extended_mov_test2 +verifier_bounds/reg32_any_reg32_xor_3 +verifier_bounds/reg_any_reg_xor_3 +verifier_bounds/shift_of_maybe_negative_number +verifier_bounds/shift_with_64_bit_input +verifier_bounds/shift_with_oversized_count_operand 
+verifier_bounds/size_signed_32bit_overflow_test1 +verifier_bounds/size_signed_32bit_overflow_test2 +verifier_bounds/size_signed_32bit_overflow_test3 +verifier_bounds/size_signed_32bit_overflow_test4 +verifier_bounds/var_off_insn_off_test1 +verifier_bounds/var_off_insn_off_test2 +verifier_bounds_deduction/deducing_bounds_from_const_1 +verifier_bounds_deduction/deducing_bounds_from_const_10 +verifier_bounds_deduction/deducing_bounds_from_const_3 +verifier_bounds_deduction/deducing_bounds_from_const_5 +verifier_bounds_deduction/deducing_bounds_from_const_6 +verifier_bounds_deduction/deducing_bounds_from_const_7 +verifier_bounds_deduction/deducing_bounds_from_const_8 +verifier_bounds_deduction/deducing_bounds_from_const_9 +verifier_bounds_mix_sign_unsign/checks_mixing_signed_and_unsigned +verifier_bounds_mix_sign_unsign/signed_and_unsigned_positive_bounds +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_10 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_11 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_12 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_13 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_14 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_15 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_2 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_3 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_5 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_6 +verifier_bounds_mix_sign_unsign/signed_and_unsigned_variant_8 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_16 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_32 +verifier_btf_ctx_access/ctx_access_u32_pointer_reject_8 +verifier_cfg/conditional_loop +verifier_cfg/loop2_back_edge +verifier_cfg/loop_back_edge +verifier_cfg/out_of_range_jump +verifier_cfg/out_of_range_jump2 +verifier_cfg/uncond_loop_after_cond_jmp +verifier_cfg/uncond_loop_in_subprog_after_cond_jmp +verifier_cfg/unreachable +verifier_cfg/unreachable2 +verifier_cgroup_inv_retcode/with_invalid_return_code_test1 +verifier_cgroup_inv_retcode/with_invalid_return_code_test3 +verifier_cgroup_inv_retcode/with_invalid_return_code_test5 +verifier_cgroup_inv_retcode/with_invalid_return_code_test6 +verifier_cgroup_inv_retcode/with_invalid_return_code_test7 +verifier_cgroup_skb/data_meta_for_cgroup_skb +verifier_cgroup_skb/flow_keys_for_cgroup_skb +verifier_cgroup_skb/napi_id_for_cgroup_skb +verifier_cgroup_skb/tc_classid_for_cgroup_skb +verifier_cgroup_storage/cpu_cgroup_storage_access_1 +verifier_cgroup_storage/cpu_cgroup_storage_access_2 +verifier_cgroup_storage/cpu_cgroup_storage_access_3 +verifier_cgroup_storage/cpu_cgroup_storage_access_4 +verifier_cgroup_storage/cpu_cgroup_storage_access_5 +verifier_cgroup_storage/cpu_cgroup_storage_access_6 +verifier_cgroup_storage/invalid_cgroup_storage_access_1 +verifier_cgroup_storage/invalid_cgroup_storage_access_2 +verifier_cgroup_storage/invalid_cgroup_storage_access_3 +verifier_cgroup_storage/invalid_cgroup_storage_access_4 +verifier_cgroup_storage/invalid_cgroup_storage_access_5 +verifier_cgroup_storage/invalid_cgroup_storage_access_6 +verifier_const/bprm +verifier_const/tcx1 +verifier_const/tcx4 +verifier_const/tcx7 +verifier_const_or/not_bypass_stack_boundary_checks_1 +verifier_const_or/not_bypass_stack_boundary_checks_2 +verifier_ctx/context_stores_via_bpf_atomic +verifier_ctx/ctx_pointer_to_helper_1 +verifier_ctx/ctx_pointer_to_helper_2 +verifier_ctx/ctx_pointer_to_helper_3 
+verifier_ctx/make_ptr_to_ctx_unusable +verifier_ctx/null_check_4_ctx_const +verifier_ctx/null_check_8_null_bind +verifier_ctx/or_null_check_3_1 +verifier_ctx_sk_msg/of_size_in_sk_msg +verifier_ctx_sk_msg/past_end_of_sk_msg +verifier_ctx_sk_msg/read_offset_in_sk_msg +verifier_d_path/d_path_reject +verifier_direct_packet_access/access_test15_spill_with_xadd +verifier_direct_packet_access/direct_packet_access_test3 +verifier_direct_packet_access/id_in_regsafe_bad_access +verifier_direct_packet_access/packet_access_test10_write_invalid +verifier_direct_packet_access/pkt_end_reg_bad_access +verifier_direct_packet_access/pkt_end_reg_both_accesses +verifier_direct_packet_access/test16_arith_on_data_end +verifier_direct_packet_access/test23_x_pkt_ptr_4 +verifier_direct_packet_access/test26_marking_on_bad_access +verifier_direct_packet_access/test28_marking_on_bad_access +verifier_direct_stack_access_wraparound +verifier_global_ptr_args +verifier_global_subprogs +verifier_helper_access_var_len/bitwise_and_jmp_wrong_max +verifier_helper_access_var_len/jmp_signed_no_min_check +verifier_helper_access_var_len/map_adjusted_jmp_wrong_max +verifier_helper_access_var_len/memory_map_jmp_wrong_max +verifier_helper_access_var_len/memory_stack_jmp_bounds_offset +verifier_helper_access_var_len/memory_stack_jmp_wrong_max +verifier_helper_access_var_len/ptr_to_mem_or_null_2 +verifier_helper_access_var_len/ptr_to_mem_or_null_8 +verifier_helper_access_var_len/ptr_to_mem_or_null_9 +verifier_helper_access_var_len/stack_jmp_no_max_check +verifier_helper_packet_access/cls_helper_fail_range_1 +verifier_helper_packet_access/cls_helper_fail_range_2 +verifier_helper_packet_access/cls_helper_fail_range_3 +verifier_helper_packet_access/packet_ptr_with_bad_range_1 +verifier_helper_packet_access/packet_ptr_with_bad_range_2 +verifier_helper_packet_access/packet_test2_unchecked_packet_ptr +verifier_helper_packet_access/ptr_with_too_short_range_1 +verifier_helper_packet_access/ptr_with_too_short_range_2 +verifier_helper_packet_access/test11_cls_unsuitable_helper_1 +verifier_helper_packet_access/test12_cls_unsuitable_helper_2 +verifier_helper_packet_access/test15_cls_helper_fail_sub +verifier_helper_packet_access/test20_pkt_end_as_input +verifier_helper_packet_access/test7_cls_unchecked_packet_ptr +verifier_helper_packet_access/to_packet_test21_wrong_reg +verifier_helper_restricted +verifier_helper_value_access/access_to_map_empty_range +verifier_helper_value_access/access_to_map_negative_range +verifier_helper_value_access/access_to_map_possibly_empty_range +verifier_helper_value_access/access_to_map_wrong_size +verifier_helper_value_access/bounds_check_using_bad_access_1 +verifier_helper_value_access/bounds_check_using_bad_access_2 +verifier_helper_value_access/check_using_s_bad_access_1 +verifier_helper_value_access/check_using_s_bad_access_2 +verifier_helper_value_access/const_imm_negative_range_adjustment_1 +verifier_helper_value_access/const_imm_negative_range_adjustment_2 +verifier_helper_value_access/const_reg_negative_range_adjustment_1 +verifier_helper_value_access/const_reg_negative_range_adjustment_2 +verifier_helper_value_access/imm_out_of_bound_1 +verifier_helper_value_access/imm_out_of_bound_2 +verifier_helper_value_access/imm_out_of_bound_range +verifier_helper_value_access/map_out_of_bound_range +verifier_helper_value_access/map_via_variable_empty_range +verifier_helper_value_access/reg_out_of_bound_1 +verifier_helper_value_access/reg_out_of_bound_2 +verifier_helper_value_access/reg_out_of_bound_range 
+verifier_helper_value_access/via_const_imm_empty_range +verifier_helper_value_access/via_const_reg_empty_range +verifier_helper_value_access/via_variable_no_max_check_1 +verifier_helper_value_access/via_variable_no_max_check_2 +verifier_helper_value_access/via_variable_wrong_max_check_1 +verifier_helper_value_access/via_variable_wrong_max_check_2 +verifier_int_ptr/arg_ptr_to_long_misaligned +verifier_int_ptr/to_long_size_sizeof_long +verifier_iterating_callbacks/bpf_loop_iter_limit_overflow +verifier_iterating_callbacks/check_add_const_3regs +verifier_iterating_callbacks/check_add_const_3regs_2if +verifier_iterating_callbacks/check_add_const_regsafe_off +verifier_iterating_callbacks/iter_limit_bug +verifier_iterating_callbacks/jgt_imm64_and_may_goto +verifier_iterating_callbacks/loop_detection +verifier_iterating_callbacks/may_goto_self +verifier_iterating_callbacks/unsafe_find_vma +verifier_iterating_callbacks/unsafe_for_each_map_elem +verifier_iterating_callbacks/unsafe_on_2nd_iter +verifier_iterating_callbacks/unsafe_on_zero_iter +verifier_iterating_callbacks/unsafe_ringbuf_drain +verifier_jeq_infer_not_null/unchanged_for_jeq_false_branch +verifier_jeq_infer_not_null/unchanged_for_jne_true_branch +verifier_kfunc_prog_types/cgrp_kfunc_raw_tp +verifier_kfunc_prog_types/cpumask_kfunc_raw_tp +verifier_kfunc_prog_types/task_kfunc_raw_tp +verifier_ld_ind/ind_check_calling_conv_r1 +verifier_ld_ind/ind_check_calling_conv_r2 +verifier_ld_ind/ind_check_calling_conv_r3 +verifier_ld_ind/ind_check_calling_conv_r4 +verifier_ld_ind/ind_check_calling_conv_r5 +verifier_leak_ptr/leak_pointer_into_ctx_1 +verifier_leak_ptr/leak_pointer_into_ctx_2 +verifier_linked_scalars +verifier_loops1/bounded_recursion +verifier_loops1/infinite_loop_in_two_jumps +verifier_loops1/infinite_loop_three_jump_trick +verifier_loops1/loop_after_a_conditional_jump +verifier_lsm/bool_retval_test3 +verifier_lsm/bool_retval_test4 +verifier_lsm/disabled_hook_test1 +verifier_lsm/disabled_hook_test2 +verifier_lsm/disabled_hook_test3 +verifier_lsm/errno_zero_retval_test4 +verifier_lsm/errno_zero_retval_test5 +verifier_lsm/errno_zero_retval_test6 +verifier_lwt/not_permitted_for_lwt_prog +verifier_lwt/packet_write_for_lwt_in +verifier_lwt/packet_write_for_lwt_out +verifier_lwt/tc_classid_for_lwt_in +verifier_lwt/tc_classid_for_lwt_out +verifier_lwt/tc_classid_for_lwt_xmit +verifier_map_in_map/invalid_inner_map_pointer +verifier_map_in_map/on_the_inner_map_pointer +verifier_map_ptr/bpf_map_ptr_write_rejected +verifier_map_ptr/read_non_existent_field_rejected +verifier_map_ptr/read_with_negative_offset_rejected +verifier_map_ptr_mixing +verifier_map_ret_val +verifier_meta_access/meta_access_test10 +verifier_meta_access/meta_access_test2 +verifier_meta_access/meta_access_test3 +verifier_meta_access/meta_access_test4 +verifier_meta_access/meta_access_test5 +verifier_meta_access/meta_access_test6 +verifier_meta_access/meta_access_test9 +verifier_netfilter_ctx/with_invalid_ctx_access_test1 +verifier_netfilter_ctx/with_invalid_ctx_access_test2 +verifier_netfilter_ctx/with_invalid_ctx_access_test3 +verifier_netfilter_ctx/with_invalid_ctx_access_test4 +verifier_netfilter_ctx/with_invalid_ctx_access_test5 +verifier_netfilter_retcode/with_invalid_return_code_test1 +verifier_netfilter_retcode/with_invalid_return_code_test4 +verifier_or_jmp32_k +verifier_prevent_map_lookup +verifier_raw_stack/bytes_spilled_regs_corruption_2 +verifier_raw_stack/load_bytes_invalid_access_1 +verifier_raw_stack/load_bytes_invalid_access_2 
+verifier_raw_stack/load_bytes_invalid_access_3 +verifier_raw_stack/load_bytes_invalid_access_4 +verifier_raw_stack/load_bytes_invalid_access_5 +verifier_raw_stack/load_bytes_invalid_access_6 +verifier_raw_stack/load_bytes_negative_len_2 +verifier_raw_stack/load_bytes_spilled_regs_corruption +verifier_raw_stack/skb_load_bytes_negative_len +verifier_raw_stack/skb_load_bytes_zero_len +verifier_raw_tp_writable +verifier_ref_tracking +verifier_reg_equal/subreg_equality_2 +verifier_regalloc/regalloc_and_spill_negative +verifier_regalloc/regalloc_negative +verifier_regalloc/regalloc_src_reg_negative +verifier_ringbuf/ringbuf_invalid_reservation_offset_1 +verifier_ringbuf/ringbuf_invalid_reservation_offset_2 +verifier_runtime_jit +verifier_scalar_ids/check_ids_in_regsafe +verifier_scalar_ids/check_ids_in_regsafe_2 +verifier_scalar_ids/linked_regs_broken_link_2 +verifier_search_pruning/for_u32_spills_u64_fill +verifier_search_pruning/liveness_pruning_and_write_screening +verifier_search_pruning/short_loop1 +verifier_search_pruning/should_be_verified_nop_operation +verifier_search_pruning/tracking_for_u32_spill_fill +verifier_search_pruning/varlen_map_value_access_pruning +verifier_sock/bpf_sk_fullsock_skb_sk +verifier_sock/bpf_sk_release_skb_sk +verifier_sock/bpf_tcp_sock_skb_sk +verifier_sock/dst_port_byte_load_invalid +verifier_sock/dst_port_half_load_invalid_1 +verifier_sock/dst_port_half_load_invalid_2 +verifier_sock/invalidate_pkt_pointers_by_tail_call +verifier_sock/invalidate_pkt_pointers_from_global_func +verifier_sock/map_lookup_elem_smap_key +verifier_sock/map_lookup_elem_sockhash_key +verifier_sock/map_lookup_elem_sockmap_key +verifier_sock/no_null_check_on_ret_1 +verifier_sock/no_null_check_on_ret_2 +verifier_sock/of_bpf_skc_to_helpers +verifier_sock/post_bind4_read_mark +verifier_sock/post_bind4_read_src_ip6 +verifier_sock/post_bind6_read_src_ip4 +verifier_sock/sk_1_1_value_1 +verifier_sock/sk_no_skb_sk_check_1 +verifier_sock/sk_no_skb_sk_check_2 +verifier_sock/sk_sk_type_fullsock_field_1 +verifier_sock/skb_sk_beyond_last_field_1 +verifier_sock/skb_sk_beyond_last_field_2 +verifier_sock/skb_sk_no_null_check +verifier_sock/sock_create_read_src_port +verifier_sock_addr/bind4_bad_return_code +verifier_sock_addr/bind6_bad_return_code +verifier_sock_addr/connect4_bad_return_code +verifier_sock_addr/connect6_bad_return_code +verifier_sock_addr/connect_unix_bad_return_code +verifier_sock_addr/getpeername4_bad_return_code +verifier_sock_addr/getpeername6_bad_return_code +verifier_sock_addr/getpeername_unix_bad_return_code +verifier_sock_addr/getsockname4_bad_return_code +verifier_sock_addr/getsockname6_bad_return_code +verifier_sock_addr/getsockname_unix_unix_bad_return_code +verifier_sock_addr/recvmsg4_bad_return_code +verifier_sock_addr/recvmsg6_bad_return_code +verifier_sock_addr/recvmsg_unix_bad_return_code +verifier_sock_addr/sendmsg4_bad_return_code +verifier_sock_addr/sendmsg6_bad_return_code +verifier_sock_addr/sendmsg_unix_bad_return_code +verifier_sockmap_mutate/test_flow_dissector_update +verifier_sockmap_mutate/test_raw_tp_delete +verifier_sockmap_mutate/test_raw_tp_update +verifier_sockmap_mutate/test_sockops_update +verifier_spill_fill/_6_offset_to_skb_data +verifier_spill_fill/addr_offset_to_skb_data +verifier_spill_fill/check_corrupted_spill_fill +verifier_spill_fill/fill_32bit_after_spill_64bit_clear_id +verifier_spill_fill/spill_16bit_of_32bit_fail +verifier_spill_fill/spill_32bit_of_64bit_fail +verifier_spill_fill/u64_offset_to_skb_data 
+verifier_spill_fill/with_invalid_reg_offset_0 +verifier_spin_lock/call_within_a_locked_region +verifier_spin_lock/lock_test2_direct_ld_st +verifier_spin_lock/lock_test3_direct_ld_st +verifier_spin_lock/lock_test4_direct_ld_st +verifier_spin_lock/lock_test7_unlock_without_lock +verifier_spin_lock/reg_id_for_map_value +verifier_spin_lock/spin_lock_test6_missing_unlock +verifier_spin_lock/spin_lock_test8_double_lock +verifier_spin_lock/spin_lock_test9_different_lock +verifier_spin_lock/test11_ld_abs_under_lock +verifier_stack_ptr/load_bad_alignment_on_off +verifier_stack_ptr/load_bad_alignment_on_reg +verifier_stack_ptr/load_out_of_bounds_high +verifier_stack_ptr/load_out_of_bounds_low +verifier_stack_ptr/to_stack_check_high_4 +verifier_stack_ptr/to_stack_check_high_5 +verifier_stack_ptr/to_stack_check_high_6 +verifier_stack_ptr/to_stack_check_high_7 +verifier_stack_ptr/to_stack_check_low_3 +verifier_stack_ptr/to_stack_check_low_4 +verifier_stack_ptr/to_stack_check_low_5 +verifier_stack_ptr/to_stack_check_low_6 +verifier_stack_ptr/to_stack_check_low_7 +verifier_subprog_precision/callback_precise_return_fail +verifier_tailcall_jit +verifier_uninit +verifier_unpriv +verifier_unpriv_perf +verifier_value/store_of_cleared_call_register +verifier_value_illegal_alu +verifier_value_or_null/map_access_from_else_condition +verifier_value_or_null/map_value_or_null_1 +verifier_value_or_null/map_value_or_null_2 +verifier_value_or_null/map_value_or_null_3 +verifier_value_or_null/multiple_map_lookup_elem_calls +verifier_value_or_null/null_check_ids_in_regsafe +verifier_value_ptr_arith/access_known_scalar_value_ptr_2 +verifier_value_ptr_arith/access_unknown_scalar_value_ptr +verifier_value_ptr_arith/access_value_ptr_known_scalar +verifier_value_ptr_arith/access_value_ptr_unknown_scalar +verifier_value_ptr_arith/access_value_ptr_value_ptr_1 +verifier_value_ptr_arith/access_value_ptr_value_ptr_2 +verifier_value_ptr_arith/lower_oob_arith_test_1 +verifier_value_ptr_arith/to_leak_tainted_dst_reg +verifier_value_ptr_arith/unknown_scalar_value_ptr_4 +verifier_value_ptr_arith/value_ptr_known_scalar_2_1 +verifier_value_ptr_arith/value_ptr_known_scalar_3 +verifier_var_off/access_max_out_of_bound +verifier_var_off/access_min_out_of_bound +verifier_var_off/stack_write_clobbers_spilled_regs +verifier_var_off/variable_offset_ctx_access +verifier_var_off/variable_offset_stack_access_unbounded +verifier_var_off/zero_sized_access_max_out_of_bound +verifier_vfs_reject +verifier_xadd/xadd_w_check_unaligned_map +verifier_xadd/xadd_w_check_unaligned_pkt +verifier_xadd/xadd_w_check_unaligned_stack +verifier_xdp_direct_packet_access/corner_case_1_bad_access_1 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_10 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_11 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_12 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_13 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_14 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_15 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_16 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_2 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_3 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_4 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_5 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_6 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_7 +verifier_xdp_direct_packet_access/corner_case_1_bad_access_8 
+verifier_xdp_direct_packet_access/corner_case_1_bad_access_9 +verifier_xdp_direct_packet_access/end_mangling_bad_access_1 +verifier_xdp_direct_packet_access/end_mangling_bad_access_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_1_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_5 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_6 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_7 +verifier_xdp_direct_packet_access/pkt_data_bad_access_2_8 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_end_bad_access_2_4 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_1_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_1 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_2 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_3 +verifier_xdp_direct_packet_access/pkt_meta_bad_access_2_4 +verify_pkcs7_sig +xdp_synproxy diff --git a/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 b/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 new file mode 100644 index 0000000000000..0c02eae8f5cd1 --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.test_progs_cpuv4 @@ -0,0 +1 @@ +verifier_arena/basic_alloc2 diff --git a/ci/vmtest/configs/DENYLIST.x86_64 b/ci/vmtest/configs/DENYLIST.x86_64 new file mode 100644 index 0000000000000..6fc3413daab9f --- /dev/null +++ b/ci/vmtest/configs/DENYLIST.x86_64 @@ -0,0 +1 @@ +netcnt # with kvm enabled, fails with unexpected packet counts: actual 10001 != expected 10000 diff --git a/ci/vmtest/configs/run-vmtest.env b/ci/vmtest/configs/run-vmtest.env new file mode 100644 index 0000000000000..c60f1db6673c7 --- /dev/null +++ b/ci/vmtest/configs/run-vmtest.env @@ -0,0 +1,42 @@ +#!/bin/bash + +# This file is sourced by libbpf/ci/run-vmtest GitHub Action scripts. +# +# The primary reason it exists is that assembling ALLOWLIST and +# DENYLIST for a particular test run is not a trivial operation. +# +# Users of the libbpf/ci/run-vmtest action need to be able to specify a +# list of allow/denylist **files**, which later have to be correctly +# merged into a single allow/denylist passed to a test runner. +# +# Obviously it's preferable for the scripts merging many lists into +# one to be reusable, and not copy-pasted between repositories which +# use libbpf/ci actions. And specifying the lists should be trivial. +# This file is a solution to that.
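As a sketch of the consuming side: the pipe-separated exports defined at the end of this file can be split back into an array and merged into one list. SELFTESTS_BPF_DENYLIST_FILES is the variable exported below; everything else here is illustrative, not part of libbpf/ci:

#!/bin/bash
# Sketch: rebuild the array from the pipe-separated export and
# concatenate whichever list files actually exist into one denylist.
set -euo pipefail

IFS='|' read -r -a denylist_files <<< "${SELFTESTS_BPF_DENYLIST_FILES:-}"

merged=$(mktemp)
for f in "${denylist_files[@]}"; do
    if [[ -f "$f" ]]; then
        cat "$f" >> "$merged"
    fi
done
sort -u "$merged"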
+ +# $SELFTESTS_BPF and $VMTEST_CONFIGS are set in the workflow, before +# libbpf/ci/run-vmtest action is called +# See .github/workflows/kernel-test.yml + +ALLOWLIST_FILES=( + "${SELFTESTS_BPF}/ALLOWLIST" + "${SELFTESTS_BPF}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST" + "${VMTEST_CONFIGS}/ALLOWLIST.${ARCH}" + "${VMTEST_CONFIGS}/ALLOWLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/ALLOWLIST.${KERNEL_TEST}" +) + +DENYLIST_FILES=( + "${SELFTESTS_BPF}/DENYLIST" + "${SELFTESTS_BPF}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST" + "${VMTEST_CONFIGS}/DENYLIST.${ARCH}" + "${VMTEST_CONFIGS}/DENYLIST.${DEPLOYMENT}" + "${VMTEST_CONFIGS}/DENYLIST.${KERNEL_TEST}" +) + +# Export pipe-separated strings, because bash doesn't support array export +export SELFTESTS_BPF_ALLOWLIST_FILES=$(IFS="|"; echo "${ALLOWLIST_FILES[*]}") +export SELFTESTS_BPF_DENYLIST_FILES=$(IFS="|"; echo "${DENYLIST_FILES[*]}") + diff --git a/ci/vmtest/configs/run_veristat.kernel.cfg b/ci/vmtest/configs/run_veristat.kernel.cfg new file mode 100644 index 0000000000000..807efc251073f --- /dev/null +++ b/ci/vmtest/configs/run_veristat.kernel.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${SELFTESTS_BPF}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_CFG_FILE="${SELFTESTS_BPF}/veristat.cfg" +VERISTAT_OUTPUT="veristat-kernel" diff --git a/ci/vmtest/configs/run_veristat.meta.cfg b/ci/vmtest/configs/run_veristat.meta.cfg new file mode 100644 index 0000000000000..14f08d241d206 --- /dev/null +++ b/ci/vmtest/configs/run_veristat.meta.cfg @@ -0,0 +1,4 @@ +VERISTAT_OBJECTS_DIR="${WORKING_DIR}/bpf_objects" +VERISTAT_OBJECTS_GLOB="*.o" +VERISTAT_OUTPUT="veristat-meta" +VERISTAT_CFG_FILE="${VERISTAT_CONFIGS}/veristat_meta.cfg" diff --git a/ci/vmtest/configs/run_veristat.scx.cfg b/ci/vmtest/configs/run_veristat.scx.cfg new file mode 100644 index 0000000000000..740cf8e960b32 --- /dev/null +++ b/ci/vmtest/configs/run_veristat.scx.cfg @@ -0,0 +1,3 @@ +VERISTAT_OBJECTS_DIR="${SCX_PROGS}" +VERISTAT_OBJECTS_GLOB="*.bpf.o" +VERISTAT_OUTPUT="veristat-scx" diff --git a/ci/vmtest/configs/veristat_meta.cfg b/ci/vmtest/configs/veristat_meta.cfg new file mode 100644 index 0000000000000..a8c25d71cb9e2 --- /dev/null +++ b/ci/vmtest/configs/veristat_meta.cfg @@ -0,0 +1,10 @@ +# List of exceptions we know about that are not going to work with veristat. 
+ +# needs 'migrate_misplaced_page' which went away in +# commit 73eab3ca481e ("mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio()") +!numamove_bpf-numamove_bpf.o + +# use non-libbpf loader +!takeover_bpf_lib-takeover.bpf.o +!tcp_tuner_bpf_lib-tcptuner.bpf.o + diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c index 5e2b2680d7dbf..9e4bb7fbde25e 100644 --- a/crypto/async_tx/async_pq.c +++ b/crypto/async_tx/async_pq.c @@ -119,7 +119,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks, for (i = 0; i < disks; i++) { if (blocks[i] == NULL) { BUG_ON(i > disks - 3); /* P or Q can't be zero */ - srcs[i] = (void*)raid6_empty_zero_page; + srcs[i] = raid6_get_zero_page(); } else { srcs[i] = page_address(blocks[i]) + offsets[i]; diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c index 354b8cd5537f8..539ea5b378dcd 100644 --- a/crypto/async_tx/async_raid6_recov.c +++ b/crypto/async_tx/async_raid6_recov.c @@ -414,7 +414,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void *) raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; @@ -497,7 +497,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila, async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) if (blocks[i] == NULL) - ptrs[i] = (void*)raid6_empty_zero_page; + ptrs[i] = raid6_get_zero_page(); else ptrs[i] = page_address(blocks[i]) + offs[i]; diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index db8c5c03d3a25..16baa038d4125 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -286,7 +286,7 @@ config EFI_SBAT config EFI_SBAT_FILE string "Embedded SBAT section file path" - depends on EFI_ZBOOT + depends on EFI_ZBOOT || (EFI_STUB && X86) help SBAT section provides a way to improve SecureBoot revocations of UEFI binaries by introducing a generation-based mechanism. 
With SBAT, older diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c index e8a04e476c571..09a64f224c491 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c @@ -220,8 +220,7 @@ static int guc_action_control_log(struct intel_guc *guc, bool enable, */ static int subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { /* * Use no-overwrite mode by default, where relay will stop accepting diff --git a/drivers/irqchip/irq-imgpdc.c b/drivers/irqchip/irq-imgpdc.c index f0410d5d7315d..e9ef2f5a7207e 100644 --- a/drivers/irqchip/irq-imgpdc.c +++ b/drivers/irqchip/irq-imgpdc.c @@ -372,8 +372,8 @@ static int pdc_intc_probe(struct platform_device *pdev) priv->syswake_irq = irq; /* Set up an IRQ domain */ - priv->domain = irq_domain_create_linear(of_fwnode_handle(node), 16, &irq_generic_chip_ops, - priv); + priv->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 16, &irq_generic_chip_ops, + priv); if (unlikely(!priv->domain)) { dev_err(&pdev->dev, "cannot add IRQ domain\n"); return -ENOMEM; diff --git a/drivers/irqchip/irq-imx-irqsteer.c b/drivers/irqchip/irq-imx-irqsteer.c index 6dc9ac48fee55..4682ce5bf8d3c 100644 --- a/drivers/irqchip/irq-imx-irqsteer.c +++ b/drivers/irqchip/irq-imx-irqsteer.c @@ -212,8 +212,8 @@ static int imx_irqsteer_probe(struct platform_device *pdev) /* steer all IRQs into configured channel */ writel_relaxed(BIT(data->channel), data->regs + CHANCTRL); - data->domain = irq_domain_create_linear(of_fwnode_handle(np), data->reg_num * 32, - &imx_irqsteer_domain_ops, data); + data->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), data->reg_num * 32, + &imx_irqsteer_domain_ops, data); if (!data->domain) { dev_err(&pdev->dev, "failed to create IRQ domain\n"); ret = -ENOMEM; diff --git a/drivers/irqchip/irq-keystone.c b/drivers/irqchip/irq-keystone.c index c9e902b7bf488..922fff09354f9 100644 --- a/drivers/irqchip/irq-keystone.c +++ b/drivers/irqchip/irq-keystone.c @@ -157,8 +157,8 @@ static int keystone_irq_probe(struct platform_device *pdev) kirq->chip.irq_mask = keystone_irq_setmask; kirq->chip.irq_unmask = keystone_irq_unmask; - kirq->irqd = irq_domain_create_linear(of_fwnode_handle(np), KEYSTONE_N_IRQ, - &keystone_irq_ops, kirq); + kirq->irqd = irq_domain_create_linear(dev_fwnode(dev), KEYSTONE_N_IRQ, &keystone_irq_ops, + kirq); if (!kirq->irqd) { dev_err(dev, "IRQ domain registration failed\n"); return -ENODEV; diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 34e8d09c12a0b..19a57c5e2b2ea 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -375,9 +375,13 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, /* * The GIC specifies that we can only route an interrupt to one VP(E), * ie. CPU in Linux parlance, at a time. Therefore we always route to - * the first online CPU in the mask. + * the first forced or online CPU in the mask. 
*/ - cpu = cpumask_first_and(cpumask, cpu_online_mask); + if (force) + cpu = cpumask_first(cpumask); + else + cpu = cpumask_first_and(cpumask, cpu_online_mask); + if (cpu >= NR_CPUS) return -EINVAL; diff --git a/drivers/irqchip/irq-mvebu-pic.c b/drivers/irqchip/irq-mvebu-pic.c index 8db638aa21d24..cd8b73482b9f5 100644 --- a/drivers/irqchip/irq-mvebu-pic.c +++ b/drivers/irqchip/irq-mvebu-pic.c @@ -150,7 +150,7 @@ static int mvebu_pic_probe(struct platform_device *pdev) return -EINVAL; } - pic->domain = irq_domain_create_linear(of_fwnode_handle(node), PIC_MAX_IRQS, + pic->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), PIC_MAX_IRQS, &mvebu_pic_domain_ops, pic); if (!pic->domain) { dev_err(&pdev->dev, "Failed to allocate irq domain\n"); diff --git a/drivers/irqchip/irq-pruss-intc.c b/drivers/irqchip/irq-pruss-intc.c index 87a5813fd8359..81078d56f38dc 100644 --- a/drivers/irqchip/irq-pruss-intc.c +++ b/drivers/irqchip/irq-pruss-intc.c @@ -555,7 +555,7 @@ static int pruss_intc_probe(struct platform_device *pdev) mutex_init(&intc->lock); - intc->domain = irq_domain_create_linear(of_fwnode_handle(dev->of_node), max_system_events, + intc->domain = irq_domain_create_linear(dev_fwnode(dev), max_system_events, &pruss_intc_irq_domain_ops, intc); if (!intc->domain) return -ENOMEM; diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c index 0959ed43b1a9a..117b74b635ea8 100644 --- a/drivers/irqchip/irq-renesas-intc-irqpin.c +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -513,10 +513,8 @@ static int intc_irqpin_probe(struct platform_device *pdev) irq_chip->irq_set_wake = intc_irqpin_irq_set_wake; irq_chip->flags = IRQCHIP_MASK_ON_SUSPEND; - p->irq_domain = irq_domain_create_simple(of_fwnode_handle(dev->of_node), - nirqs, 0, - &intc_irqpin_irq_domain_ops, - p); + p->irq_domain = irq_domain_create_simple(dev_fwnode(dev), nirqs, 0, + &intc_irqpin_irq_domain_ops, p); if (!p->irq_domain) { ret = -ENXIO; dev_err(dev, "cannot initialize irq domain\n"); diff --git a/drivers/irqchip/irq-renesas-irqc.c b/drivers/irqchip/irq-renesas-irqc.c index 5c3196e5a4374..b46bbb66c2641 100644 --- a/drivers/irqchip/irq-renesas-irqc.c +++ b/drivers/irqchip/irq-renesas-irqc.c @@ -168,7 +168,7 @@ static int irqc_probe(struct platform_device *pdev) p->cpu_int_base = p->iomem + IRQC_INT_CPU_BASE(0); /* SYS-SPI */ - p->irq_domain = irq_domain_create_linear(of_fwnode_handle(dev->of_node), p->number_of_irqs, + p->irq_domain = irq_domain_create_linear(dev_fwnode(dev), p->number_of_irqs, &irq_generic_chip_ops, p); if (!p->irq_domain) { ret = -ENXIO; diff --git a/drivers/irqchip/irq-renesas-rza1.c b/drivers/irqchip/irq-renesas-rza1.c index 0a9640ba0adb4..a697eb55ac90e 100644 --- a/drivers/irqchip/irq-renesas-rza1.c +++ b/drivers/irqchip/irq-renesas-rza1.c @@ -231,9 +231,8 @@ static int rza1_irqc_probe(struct platform_device *pdev) priv->chip.irq_set_type = rza1_irqc_set_type; priv->chip.flags = IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_SKIP_SET_WAKE; - priv->irq_domain = irq_domain_create_hierarchy(parent, 0, IRQC_NUM_IRQ, - of_fwnode_handle(np), &rza1_irqc_domain_ops, - priv); + priv->irq_domain = irq_domain_create_hierarchy(parent, 0, IRQC_NUM_IRQ, dev_fwnode(dev), + &rza1_irqc_domain_ops, priv); if (!priv->irq_domain) { dev_err(dev, "cannot initialize irq domain\n"); ret = -ENOMEM; diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 1e861bd64f970..360d88687e4f5 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ 
b/drivers/irqchip/irq-renesas-rzg2l.c @@ -574,9 +574,8 @@ static int rzg2l_irqc_common_init(struct device_node *node, struct device_node * raw_spin_lock_init(&rzg2l_irqc_data->lock); - irq_domain = irq_domain_create_hierarchy(parent_domain, 0, IRQC_NUM_IRQ, - of_fwnode_handle(node), &rzg2l_irqc_domain_ops, - rzg2l_irqc_data); + irq_domain = irq_domain_create_hierarchy(parent_domain, 0, IRQC_NUM_IRQ, dev_fwnode(dev), + &rzg2l_irqc_domain_ops, rzg2l_irqc_data); if (!irq_domain) { pm_runtime_put(dev); return dev_err_probe(dev, -ENOMEM, "failed to add irq domain\n"); diff --git a/drivers/irqchip/irq-renesas-rzv2h.c b/drivers/irqchip/irq-renesas-rzv2h.c index 69b32c19e8ff6..57c5a3c008c91 100644 --- a/drivers/irqchip/irq-renesas-rzv2h.c +++ b/drivers/irqchip/irq-renesas-rzv2h.c @@ -558,7 +558,7 @@ static int rzv2h_icu_init_common(struct device_node *node, struct device_node *p raw_spin_lock_init(&rzv2h_icu_data->lock); irq_domain = irq_domain_create_hierarchy(parent_domain, 0, ICU_NUM_IRQ, - of_fwnode_handle(node), &rzv2h_icu_domain_ops, + dev_fwnode(&pdev->dev), &rzv2h_icu_domain_ops, rzv2h_icu_data); if (!irq_domain) { dev_err(&pdev->dev, "failed to add irq domain\n"); diff --git a/drivers/irqchip/irq-stm32mp-exti.c b/drivers/irqchip/irq-stm32mp-exti.c index c6b4407d05f9f..a24f4f1a4f8fd 100644 --- a/drivers/irqchip/irq-stm32mp-exti.c +++ b/drivers/irqchip/irq-stm32mp-exti.c @@ -683,9 +683,7 @@ static int stm32mp_exti_probe(struct platform_device *pdev) } domain = irq_domain_create_hierarchy(parent_domain, 0, drv_data->bank_nr * IRQS_PER_BANK, - of_fwnode_handle(np), &stm32mp_exti_domain_ops, - host_data); - + dev_fwnode(dev), &stm32mp_exti_domain_ops, host_data); if (!domain) { dev_err(dev, "Could not register exti domain\n"); return -ENOMEM; diff --git a/drivers/irqchip/irq-ti-sci-inta.c b/drivers/irqchip/irq-ti-sci-inta.c index 7de59238e6b0b..01963d36cfaf3 100644 --- a/drivers/irqchip/irq-ti-sci-inta.c +++ b/drivers/irqchip/irq-ti-sci-inta.c @@ -701,8 +701,7 @@ static int ti_sci_inta_irq_domain_probe(struct platform_device *pdev) if (ret) return ret; - domain = irq_domain_create_linear(of_fwnode_handle(dev_of_node(dev)), - ti_sci_get_num_resources(inta->vint), + domain = irq_domain_create_linear(dev_fwnode(dev), ti_sci_get_num_resources(inta->vint), &ti_sci_inta_irq_domain_ops, inta); if (!domain) { dev_err(dev, "Failed to allocate IRQ domain\n"); diff --git a/drivers/irqchip/irq-ti-sci-intr.c b/drivers/irqchip/irq-ti-sci-intr.c index 07fff5ae5ce09..354613e74ad0b 100644 --- a/drivers/irqchip/irq-ti-sci-intr.c +++ b/drivers/irqchip/irq-ti-sci-intr.c @@ -274,8 +274,7 @@ static int ti_sci_intr_irq_domain_probe(struct platform_device *pdev) return PTR_ERR(intr->out_irqs); } - domain = irq_domain_create_hierarchy(parent_domain, 0, 0, - of_fwnode_handle(dev_of_node(dev)), + domain = irq_domain_create_hierarchy(parent_domain, 0, 0, dev_fwnode(dev), &ti_sci_intr_irq_domain_ops, intr); if (!domain) { dev_err(dev, "Failed to allocate IRQ domain\n"); diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index e625f4fb2bb85..1e236d5b75162 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -125,7 +125,7 @@ static int ts4800_ic_probe(struct platform_device *pdev) return -EINVAL; } - data->domain = irq_domain_create_linear(of_fwnode_handle(node), 8, &ts4800_ic_ops, data); + data->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 8, &ts4800_ic_ops, data); if (!data->domain) { dev_err(&pdev->dev, "cannot add IRQ domain\n"); return -ENOMEM; diff --git 
a/drivers/net/wwan/iosm/iosm_ipc_trace.c b/drivers/net/wwan/iosm/iosm_ipc_trace.c index eeecfa3d10c5a..9656254c1c6c3 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_trace.c +++ b/drivers/net/wwan/iosm/iosm_ipc_trace.c @@ -51,8 +51,7 @@ static int ipc_trace_remove_buf_file_handler(struct dentry *dentry) } static int ipc_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/drivers/net/wwan/t7xx/t7xx_port_trace.c b/drivers/net/wwan/t7xx/t7xx_port_trace.c index 4ed8b4e29bf1d..f16d3b01302cd 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_trace.c +++ b/drivers/net/wwan/t7xx/t7xx_port_trace.c @@ -33,7 +33,7 @@ static int t7xx_trace_remove_buf_file_handler(struct dentry *dentry) } static int t7xx_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { if (relay_buf_full(buf)) { pr_err_ratelimited("Relay_buf full dropping traces"); diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 6ede55a7c5e65..78bed2def9d87 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev) void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) { - raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock; + struct pci_dev *dev = msi_desc_to_pci_dev(desc); + raw_spinlock_t *lock = &dev->msi_lock; unsigned long flags; if (!desc->pci.msi_attrib.can_mask) @@ -122,8 +123,7 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) raw_spin_lock_irqsave(lock, flags); desc->pci.msi_mask &= ~clear; desc->pci.msi_mask |= set; - pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, - desc->pci.msi_mask); + pci_write_config_dword(dev, desc->pci.mask_pos, desc->pci.msi_mask); raw_spin_unlock_irqrestore(lock, flags); } diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index 7a4e2188f1091..d2b3ae7113ab5 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -101,7 +101,8 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io req.resp_sz = resp_len; req.exit_code = SVM_VMGEXIT_GUEST_REQUEST; - rc = snp_send_guest_request(mdesc, &req, arg); + rc = snp_send_guest_request(mdesc, &req); + arg->exitinfo2 = req.exitinfo2; if (rc) goto e_free; @@ -152,7 +153,8 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque req.resp_sz = resp_len; req.exit_code = SVM_VMGEXIT_GUEST_REQUEST; - rc = snp_send_guest_request(mdesc, &req, arg); + rc = snp_send_guest_request(mdesc, &req); + arg->exitinfo2 = req.exitinfo2; if (rc) return rc; @@ -249,7 +251,8 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques req.resp_sz = resp_len; req.exit_code = SVM_VMGEXIT_EXT_GUEST_REQUEST; - ret = snp_send_guest_request(mdesc, &req, arg); + ret = snp_send_guest_request(mdesc, &req); + arg->exitinfo2 = req.exitinfo2; /* If certs length is invalid then copy the returned length */ if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) { diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 1db348f8f887a..a7061c2ad8e4d 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -356,7 +356,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); - 
fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); + fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bfe8d8af46f3c..9572bdef49eec 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -9,6 +9,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" +#include #include #include #include @@ -162,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 40b6bce12951e..89aadc6cdd877 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1071,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct address_space *mapping, if (IS_ERR(wc->w_folios[i])) { ret = PTR_ERR(wc->w_folios[i]); mlog_errno(ret); + wc->w_folios[i] = NULL; goto out; } } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 12e5d1f733256..14bf440ea4dff 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -50,8 +50,6 @@ struct ocfs2_find_inode_args unsigned int fi_sysfile_type; }; -static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; - static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_find_inode_args *args); static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); @@ -250,14 +248,77 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) { struct ocfs2_find_inode_args *args = opaque; +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, ocfs2_file_ip_alloc_sem_key; +#endif inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; - if (args->fi_sysfile_type != 0) +#ifdef CONFIG_LOCKDEP + switch (args->fi_sysfile_type) { + case BAD_BLOCK_SYSTEM_INODE: + break; + case GLOBAL_INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); + break; + case SLOT_MAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); + break; + case HEARTBEAT_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); + break; + case GLOBAL_BITMAP_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); + break; + case USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); + break; + case GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); + break; + case ORPHAN_DIR_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); + break; + case EXTENT_ALLOC_SYSTEM_INODE: lockdep_set_class(&inode->i_rwsem, - &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); + &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); + break; + case INODE_ALLOC_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); + break; + case JOURNAL_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); + break; + case LOCAL_ALLOC_SYSTEM_INODE: + 
lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); + break; + case TRUNCATE_LOG_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); + break; + case LOCAL_USER_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); + break; + case LOCAL_GROUP_QUOTA_SYSTEM_INODE: + lockdep_set_class(&inode->i_rwsem, + &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); + break; + default: + WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); + } if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || @@ -267,6 +328,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) else lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, &ocfs2_file_ip_alloc_sem_key); +#endif return 0; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 99278c8f0e242..26bc59c3a8132 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1452,8 +1452,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, newfe = (struct ocfs2_dinode *) newfe_bh->b_data; trace_ocfs2_rename_over_existing( - (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? - (unsigned long long)newfe_bh->b_blocknr : 0ULL); + (unsigned long long)newfe_blkno, newfe_bh, + (unsigned long long)newfe_bh->b_blocknr); if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6aaa94c554c12..8bdeea60742af 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -494,8 +494,6 @@ struct ocfs2_super struct rb_root osb_rf_lock_tree; struct ocfs2_refcount_tree *osb_ref_tree_lru; - struct mutex system_file_mutex; - /* * OCFS2 needs to schedule several different types of work which * require cluster locking, disk I/O, recovery waits, etc. 
Since these diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 77edcd70f72c2..0f045e45fa0c3 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -360,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file, struct ocfs2_control_message_setn *msg) { long nodenum; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; if (ocfs2_control_get_handshake_state(file) != @@ -375,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file, return -EINVAL; msg->space = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || @@ -391,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file, struct ocfs2_control_message_setv *msg) { long major, minor; - char *ptr = NULL; struct ocfs2_control_private *p = file->private_data; struct ocfs2_protocol_version *max = &ocfs2_user_plugin.sp_max_proto; @@ -409,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - major = simple_strtol(msg->major, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->major, 16, &major)) return -EINVAL; - minor = simple_strtol(msg->minor, &ptr, 16); - if (!ptr || *ptr) + if (kstrtol(msg->minor, 16, &minor)) return -EINVAL; /* @@ -441,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file, struct ocfs2_control_message_down *msg) { long nodenum; - char *p = NULL; if (ocfs2_control_get_handshake_state(file) != OCFS2_CONTROL_HANDSHAKE_VALID) @@ -456,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file, return -EINVAL; msg->space1 = msg->space2 = msg->newline = '\0'; - nodenum = simple_strtol(msg->nodestr, &p, 16); - if (!p || *p) + if (kstrtol(msg->nodestr, 16, &nodenum)) return -EINVAL; if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3d2533950bae2..4461daf909cfd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1997,8 +1997,6 @@ static int ocfs2_initialize_super(struct super_block *sb, spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); - mutex_init(&osb->system_file_mutex); - atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 53a945da873bc..b63af8d64904a 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -98,11 +98,9 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, } else arr = get_local_system_inode(osb, type, slot); - mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); - mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; @@ -112,11 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, inode = _ocfs2_get_system_file_inode(osb, type, slot); /* add one more if putting into array for first time */ - if (arr && inode) { - *arr = igrab(inode); - BUG_ON(!*arr); + if (inode && arr && !*arr && !cmpxchg(&(*arr), NULL, inode)) { + inode = igrab(inode); + BUG_ON(!inode); } - mutex_unlock(&osb->system_file_mutex); return inode; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 27972c0749e78..4be91eb6ea5ca 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2182,7 +2182,7 @@ static unsigned long 
pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_FILE; } - if (is_zero_pfn(pmd_pfn(pmd))) + if (is_huge_zero_pmd(pmd)) categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 3061043e915c8..296c5a0fcc406 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,23 +80,22 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct address_space *cache_mapping, u64 index, int length, u64 read_start, u64 read_end, int page_count) { - struct page *head_to_cache = NULL, *tail_to_cache = NULL; + struct folio *head_to_cache = NULL, *tail_to_cache = NULL; struct block_device *bdev = fullbio->bi_bdev; int start_idx = 0, end_idx = 0; - struct bvec_iter_all iter_all; + struct folio_iter fi; struct bio *bio = NULL; - struct bio_vec *bv; int idx = 0; int err = 0; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - struct page **cache_pages = kmalloc_array(page_count, + struct folio **cache_folios = kmalloc_array(page_count, sizeof(void *), GFP_KERNEL | __GFP_ZERO); #endif - bio_for_each_segment_all(bv, fullbio, iter_all) { - struct page *page = bv->bv_page; + bio_for_each_folio_all(fi, fullbio) { + struct folio *folio = fi.folio; - if (page->mapping == cache_mapping) { + if (folio->mapping == cache_mapping) { idx++; continue; } @@ -111,13 +110,13 @@ static int squashfs_bio_read_cached(struct bio *fullbio, * adjacent blocks. */ if (idx == 0 && index != read_start) - head_to_cache = page; + head_to_cache = folio; else if (idx == page_count - 1 && index + length != read_end) - tail_to_cache = page; + tail_to_cache = folio; #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL /* Cache all pages in the BIO for repeated reads */ - else if (cache_pages) - cache_pages[idx] = page; + else if (cache_folios) + cache_folios[idx] = folio; #endif if (!bio || idx != end_idx) { @@ -150,45 +149,45 @@ static int squashfs_bio_read_cached(struct bio *fullbio, return err; if (head_to_cache) { - int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, head_to_cache, read_start >> PAGE_SHIFT, GFP_NOIO); if (!ret) { - SetPageUptodate(head_to_cache); - unlock_page(head_to_cache); + folio_mark_uptodate(head_to_cache); + folio_unlock(head_to_cache); } } if (tail_to_cache) { - int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, + int ret = filemap_add_folio(cache_mapping, tail_to_cache, (read_end >> PAGE_SHIFT) - 1, GFP_NOIO); if (!ret) { - SetPageUptodate(tail_to_cache); - unlock_page(tail_to_cache); + folio_mark_uptodate(tail_to_cache); + folio_unlock(tail_to_cache); } } #ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL - if (!cache_pages) + if (!cache_folios) goto out; for (idx = 0; idx < page_count; idx++) { - if (!cache_pages[idx]) + if (!cache_folios[idx]) continue; - int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + int ret = filemap_add_folio(cache_mapping, cache_folios[idx], (read_start >> PAGE_SHIFT) + idx, GFP_NOIO); if (!ret) { - SetPageUptodate(cache_pages[idx]); - unlock_page(cache_pages[idx]); + folio_mark_uptodate(cache_folios[idx]); + folio_unlock(cache_folios[idx]); } } - kfree(cache_pages); + kfree(cache_folios); out: #endif return 0; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 5ca2baa16dc25..ce7d661d5ad86 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -493,10 +493,9 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return res; } -static int squashfs_readahead_fragment(struct page
**page, +static int squashfs_readahead_fragment(struct inode *inode, struct page **page, unsigned int pages, unsigned int expected, loff_t start) { - struct inode *inode = page[0]->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); @@ -605,8 +604,8 @@ static void squashfs_readahead(struct readahead_control *ractl) if (start >> msblk->block_log == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { - res = squashfs_readahead_fragment(pages, nr_pages, - expected, start); + res = squashfs_readahead_fragment(inode, pages, + nr_pages, expected, start); if (res) goto skip_pages; continue; diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 387720933973b..2d9f61346dab8 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -17,6 +17,12 @@ #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +# define WARN_CONDITION_STR(cond_str) cond_str +#else +# define WARN_CONDITION_STR(cond_str) +#endif + #ifndef __ASSEMBLY__ #include #include @@ -100,17 +106,18 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); instrumentation_end(); \ } while (0) #else -#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) +#define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) do { \ instrumentation_begin(); \ __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ instrumentation_end(); \ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ + __WARN_FLAGS("["#condition"] ", \ + BUGFLAG_ONCE | \ BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 1fe7e7d1b214b..7b44b41d0a20d 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -13,10 +13,23 @@ */ extern struct resource crashk_res; extern struct resource crashk_low_res; +extern struct range crashk_cma_ranges[]; +#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION) +#define CRASHKERNEL_CMA +#define CRASHKERNEL_CMA_RANGES_MAX 4 +extern int crashk_cma_cnt; +#else +#define crashk_cma_cnt 0 +#define CRASHKERNEL_CMA_RANGES_MAX 0 +#endif + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - unsigned long long *low_size, bool *high); + unsigned long long *low_size, unsigned long long *cma_size, + bool *high); + +void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE diff --git a/include/linux/futex.h b/include/linux/futex.h index 005b040c4791b..b37193653e6b5 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; mutex_init(&mm->futex_hash_lock); } diff --git a/include/linux/gcd.h b/include/linux/gcd.h index cb572677fd7fb..616e81a7f7e31 100644 --- a/include/linux/gcd.h +++ b/include/linux/gcd.h @@ -3,6 +3,9 @@ #define _GCD_H #include +#include + +DECLARE_STATIC_KEY_TRUE(efficient_ffs_key); unsigned 
long gcd(unsigned long a, unsigned long b) __attribute_const__; diff --git a/include/linux/jhash.h b/include/linux/jhash.h index fa26a2dd3b52d..7c1c1821c694f 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -24,7 +24,7 @@ * Jozsef */ #include -#include +#include /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) @@ -77,9 +77,9 @@ static inline u32 jhash(const void *key, u32 length, u32 initval) /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { - a += __get_unaligned_cpu32(k); - b += __get_unaligned_cpu32(k + 4); - c += __get_unaligned_cpu32(k + 8); + a += get_unaligned((u32 *)k); + b += get_unaligned((u32 *)(k + 4)); + c += get_unaligned((u32 *)(k + 8)); __jhash_mix(a, b, c); length -= 12; k += 12; diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 93a73c076d169..fbd424b2abb14 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,6 +28,7 @@ extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; +extern void kmemleak_ignore_percpu(const void __percpu *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, @@ -97,6 +98,9 @@ static inline void kmemleak_not_leak(const void *ptr) static inline void kmemleak_transient_leak(const void *ptr) { } +static inline void kmemleak_ignore_percpu(const void __percpu *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e78fa535f61dd..0a6d8a8d2d5b7 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -103,10 +103,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_config_perf_event(const char *str); +extern void hardlockup_detector_perf_adjust_period(int cpu, u64 period); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_config_perf_event(const char *str) { } +static inline void hardlockup_detector_perf_adjust_period(int cpu, u64 period) { } #endif void watchdog_hardlockup_stop(void); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b0af8d4ef6e66..1fad1c8a4c76a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -369,8 +369,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -#ifdef CONFIG_SMP - /* * Migrate-Disable and why it is undesired. 
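 * (In brief: migrate_disable() pins the task to its current CPU while leaving it preemptible, which constrains the load balancer for the duration of the section.)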
* @@ -429,13 +427,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, extern void migrate_disable(void); extern void migrate_enable(void); -#else - -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } - -#endif /* CONFIG_SMP */ - /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 72ff44cca864b..2467b3be15c9e 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -11,8 +11,13 @@ #ifdef __KERNEL__ #include +#include -extern const char raid6_empty_zero_page[PAGE_SIZE]; +/* This should be const but the raid6 code is too convoluted for that. */ +static inline void *raid6_get_zero_page(void) +{ + return page_address(ZERO_PAGE(0)); +} #else /* ! __KERNEL__ */ /* Used for testing in user space */ @@ -191,6 +196,11 @@ static inline uint32_t raid6_jiffies(void) return tv.tv_sec*1000 + tv.tv_usec/1000; } +static inline void *raid6_get_zero_page(void) +{ + return raid6_empty_zero_page; +} + #endif /* ! __KERNEL__ */ #endif /* LINUX_RAID_RAID6_H */ diff --git a/include/linux/relay.h b/include/linux/relay.h index b3224111d074c..6772a70758401 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -28,6 +28,22 @@ */ #define RELAYFS_CHANNEL_VERSION 7 +/* + * Relay buffer statistics + */ +enum { + RELAY_STATS_BUF_FULL = (1 << 0), + RELAY_STATS_WRT_BIG = (1 << 1), + + RELAY_STATS_LAST = RELAY_STATS_WRT_BIG, +}; + +struct rchan_buf_stats +{ + unsigned int full_count; /* counter for buffer full */ + unsigned int big_count; /* counter for too big to write */ +}; + /* * Per-cpu relay channel buffer */ @@ -43,11 +59,11 @@ struct rchan_buf struct irq_work wakeup_work; /* reader wakeup */ struct dentry *dentry; /* channel file dentry */ struct kref kref; /* channel buffer refcount */ + struct rchan_buf_stats stats; /* buffer stats */ struct page **page_array; /* array of current buffer pages */ unsigned int page_count; /* number of current buffer pages */ unsigned int finalized; /* buffer has been finalized */ size_t *padding; /* padding counts per sub-buffer */ - size_t prev_padding; /* temporary variable */ size_t bytes_consumed; /* bytes consumed in cur read subbuf */ size_t early_bytes; /* bytes consumed before VFS inited */ unsigned int cpu; /* this buf's cpu */ @@ -65,7 +81,6 @@ struct rchan const struct rchan_callbacks *cb; /* client callbacks */ struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ - size_t last_toobig; /* tried to log event > subbuf size */ struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */ int is_global; /* One global buffer ? */ struct list_head list; /* for channel list */ @@ -84,7 +99,6 @@ struct rchan_callbacks * @buf: the channel buffer containing the new sub-buffer * @subbuf: the start of the new sub-buffer * @prev_subbuf: the start of the previous sub-buffer - * @prev_padding: unused space at the end of previous sub-buffer * * The client should return 1 to continue logging, 0 to stop * logging. 
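 *
 * A minimal callback under the new two-argument signature might look
 * like this (sketch of a hypothetical client, mirroring the drivers
 * converted above):
 *
 *	static int demo_subbuf_start(struct rchan_buf *buf, void *subbuf,
 *				     void *prev_subbuf)
 *	{
 *		if (relay_buf_full(buf))
 *			return 0;
 *		return 1;
 *	}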
@@ -100,8 +114,7 @@ struct rchan_callbacks */ int (*subbuf_start) (struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, - size_t prev_padding); + void *prev_subbuf); /* * create_buf_file - create file to represent a relay channel buffer @@ -161,6 +174,7 @@ struct rchan *relay_open(const char *base_filename, void *private_data); extern void relay_close(struct rchan *chan); extern void relay_flush(struct rchan *chan); +size_t relay_stats(struct rchan *chan, int flags); extern void relay_subbufs_consumed(struct rchan *chan, unsigned int cpu, size_t consumed); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f78a64beb52c..eec6b225e9d14 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -395,15 +395,10 @@ enum uclamp_id { UCLAMP_CNT }; -#ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); -#else -static inline void sched_domains_mutex_lock(void) { } -static inline void sched_domains_mutex_unlock(void) { } -#endif struct sched_param { int sched_priority; @@ -604,7 +599,6 @@ struct sched_entity { unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -612,7 +606,6 @@ struct sched_entity { * collide with read-mostly values above. */ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -842,7 +835,6 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif -#ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; @@ -858,7 +850,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -917,9 +908,7 @@ struct task_struct { cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#ifdef CONFIG_SMP unsigned short migration_disabled; -#endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU @@ -951,10 +940,8 @@ struct task_struct { struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; @@ -1778,12 +1765,8 @@ extern struct pid *cad_pid; static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. 
*/ @@ -1848,7 +1831,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -#ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -1866,33 +1848,6 @@ extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ - if ((*cpumask_bits(new_mask) & 1) == 0) - return -EINVAL; - return 0; -} -static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) -{ - if (src->user_cpus_ptr) - return -EINVAL; - return 0; -} -static inline void release_user_cpus_ptr(struct task_struct *p) -{ - WARN_ON(p->user_cpus_ptr); -} - -static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - return 0; -} -#endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1981,11 +1936,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ @@ -2012,7 +1963,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec buf; \ }) -#ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* @@ -2022,9 +1972,6 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } -#else -static inline void scheduler_ipi(void) { } -#endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); @@ -2230,7 +2177,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* @@ -2242,7 +2188,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); -#endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index f9aabbc9d22ef..c40115d4e34dc 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -29,15 +29,11 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#ifdef CONFIG_SMP - struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); -#endif /* CONFIG_SMP */ - extern u64 dl_cookie; 
extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 439f6029d3b94..8465ff1f20d1f 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,11 +11,7 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); -#else -static inline void wake_up_if_idle(int cpu) { } -#endif /* * Idle thread specific functions to determine the need_resched diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 6d67e9a5af6bb..0db7f67935fed 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,7 +6,7 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); #else @@ -23,7 +23,7 @@ static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_NO_HZ_COMMON extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ca1db4b92c324..c517dbc242f7c 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -109,11 +109,7 @@ int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP extern void sched_exec(void); -#else -#define sched_exec() {} -#endif static inline struct task_struct *get_task_struct(struct task_struct *t) { diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 198bb5cc1774b..e54e7fa76ba63 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -9,7 +9,6 @@ /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP /* Generate SD flag indexes */ #define SD_FLAG(name, mflags) __##name, @@ -200,37 +199,6 @@ extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); # define SD_INIT_NAME(type) .name = #type -#else /* CONFIG_SMP */ - -struct sched_domain_attr; - -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_resources(int this_cpu, int that_cpu) -{ - return true; -} - -static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) -{ -} - -#endif /* !CONFIG_SMP */ - #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); #else diff --git a/include/linux/smp.h b/include/linux/smp.h index f1aa0952e8c30..bea8d2826e09f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -234,7 +234,7 @@ static inline int get_boot_cpu_id(void) #endif /* !SMP */ /** - * raw_processor_id() - get the current (unstable) CPU id + * raw_smp_processor_id() - get the current (unstable) CPU id * * For then you know what you are doing and need an unstable * CPU id. 
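As a side note to the kerneldoc fix above, the distinction is worth a short sketch (illustrative helpers, not part of the patch): raw_smp_processor_id() may return a CPU id that is stale by the time it is used, while smp_processor_id() must only be called with preemption disabled.

static unsigned int stat_bucket_hint(void)
{
	/* May be stale once the task migrates; fine for heuristics. */
	return raw_smp_processor_id();
}

static void strict_per_cpu_work(void)
{
	unsigned int cpu;

	preempt_disable();
	cpu = smp_processor_id();	/* stable while preemption is off */
	/* ... operate on per-CPU state for @cpu ... */
	preempt_enable();
}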
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e5603cc91963d..b0cc60f1c458a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -998,6 +998,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on); asmlinkage long sys_uretprobe(void); +asmlinkage long sys_uprobe(void); + /* pciconfig: alpha, arm, arm64, ia64, sparc */ asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c390946..08ef78439d0df 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -17,6 +17,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -185,8 +186,14 @@ struct xol_area; struct uprobes_state { struct xol_area *xol_area; +#ifdef CONFIG_X86_64 + struct hlist_head head_tramps; +#endif }; +typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, + uprobe_opcode_t *insn, int nbytes, void *data); + extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn); extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); -extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t); +extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, + bool is_register); +extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); extern void uprobe_handle_trampoline(struct pt_regs *regs); -extern void *arch_uprobe_trampoline(unsigned long *psize); +extern void *arch_uretprobe_trampoline(unsigned long *psize); extern unsigned long uprobe_get_trampoline_vaddr(void); +extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); +extern void arch_uprobe_clear_state(struct mm_struct *mm); +extern void arch_uprobe_init_state(struct mm_struct *mm); +extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 43dec6eed559a..9785c1d49f05e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -255,7 +255,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 
-# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for the user space selector when dispatch is enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 diff --git a/init/main.c b/init/main.c index 225a58279acd7..f9f401b6fdfbc 100644 --- a/init/main.c +++ b/init/main.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include #include @@ -1068,10 +1067,6 @@ void start_kernel(void) pid_idr_init(); anon_vma_init(); -#ifdef CONFIG_X86 - if (efi_enabled(EFI_RUNTIME_SERVICES)) - efi_enter_virtual_mode(); -#endif thread_stack_cache_init(); cred_init(); fork_init(); diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index e64ce21f9a805..2ee603a98813e 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -134,6 +134,7 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT + depends on KEYS help With this option enabled, user space can interact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b9..a4ef79591eb24 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +103,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d3..87bf4d41eabba 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * This function parses "suffix" crashkernel command lines like * * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure.
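 *
 * For example (illustrative sizes, not mandated by this patch):
 *
 *	crashkernel=512M,high crashkernel=128M,low crashkernel=256M,cma
 *
 * reserves the usual high/low regions plus a 256M CMA range that the
 * first kernel can keep using for movable allocations until a crash
 * actually happens.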
*/ @@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; @@ -457,6 +471,56 @@ void __init reserve_crashkernel_generic(unsigned long long crash_size, #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 5340c5aa89e7d..a9055eccb27e1 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon */ if (offset && offset + len <= offset) return -EINVAL; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; /* - * access_ok() will clear memory tags for tagged addresses - * if current has memory tagging enabled. - - * To enable a tracer to set a tracees selector the - * selector address must be untagged for access_ok(), - * otherwise an untagged tracer will always fail to set a - * tagged tracees selector. + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. 
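 *
 * E.g. with offset' = offset + len and len' = -len, the unsigned
 * check "ip - offset' < len'" holds exactly when ip falls outside
 * the original [offset, offset + len) window, so only syscalls
 * issued from inside the window are dispatched.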
*/ - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) - return -EFAULT; - + offset = offset + len; + len = -len; break; default: return -EINVAL; } + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. + */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + task->syscall_dispatch.selector = selector; task->syscall_dispatch.offset = offset; task->syscall_dispatch.len = len; task->syscall_dispatch.on_dispatch = false; - if (mode == PR_SYS_DISPATCH_ON) + if (mode != PR_SYS_DISPATCH_OFF) set_task_syscall_work(task, SYSCALL_USER_DISPATCH); else clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c965ba77f9f8..e54081beeab9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) +void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); @@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src kunmap_atomic(kaddr); } -static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); - if (is_swbp_insn(new_opcode)) { + if (is_swbp_insn(insn)) { if (is_swbp) /* register: already installed? */ return 0; } else { @@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma, return identical; } -static int __uprobe_write_opcode(struct vm_area_struct *vma, +static int __uprobe_write(struct vm_area_struct *vma, struct folio_walk *fw, struct folio *folio, - unsigned long opcode_vaddr, uprobe_opcode_t opcode) + unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; - const bool is_register = !!is_swbp_insn(&opcode); + const unsigned long vaddr = insn_vaddr & PAGE_MASK; bool pmd_mappable; /* For now, we'll only handle PTE-mapped folios. */ @@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, */ flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); - copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + copy_to_page(fw->page, insn_vaddr, insn, nbytes); /* * When unregistering, we may only zap a PTE if uffd is disabled and @@ -483,23 +484,32 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * @opcode_vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @opcode_vaddr. 
* - * Called with mm->mmap_lock held for read or write. + * Called with mm->mmap_lock held for write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, - const unsigned long opcode_vaddr, uprobe_opcode_t opcode) + const unsigned long opcode_vaddr, uprobe_opcode_t opcode, + bool is_register) { - const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); +} + +int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, + uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) +{ + const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - int ret, is_register, ref_ctr_updated = 0; + int ret, ref_ctr_updated = 0; unsigned int gup_flags = FOLL_FORCE; struct mmu_notifier_range range; struct folio_walk fw; struct folio *folio; struct page *page; - is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) @@ -510,7 +520,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, * page that we can safely modify. Use FOLL_WRITE to trigger a write * fault if required. When unregistering, we might be lucky and the * anon page is already gone. So defer write faults until really - * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * required. Use FOLL_SPLIT_PMD, because __uprobe_write() * cannot deal with PMDs yet. */ if (is_register) @@ -522,14 +532,14 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, goto out; folio = page_folio(page); - ret = verify_opcode(page, opcode_vaddr, &opcode); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; } /* We are going to replace instruction, update ref_ctr. */ - if (!ref_ctr_updated && uprobe->ref_ctr_offset) { + if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) { folio_put(folio); @@ -561,7 +571,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, /* Walk the page tables again, to perform the actual update. */ if (folio_walk_start(&fw, vma, vaddr, 0)) { if (fw.page == page) - ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register); folio_walk_end(&fw, vma); } @@ -581,8 +591,8 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (do_update_ref_ctr && ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? 
-1 : 1); /* try collapse pmd for compound page */ if (ret > 0) @@ -603,7 +613,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true); } /** @@ -619,7 +629,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr) { return uprobe_write_opcode(auprobe, vma, vaddr, - *(uprobe_opcode_t *)&auprobe->insn); + *(uprobe_opcode_t *)&auprobe->insn, false); } /* uprobe should have guaranteed positive refcount */ @@ -1052,7 +1062,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, if (IS_ERR(page)) return PTR_ERR(page); - copy_from_page(page, offset, insn, nbytes); + uprobe_copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; @@ -1398,7 +1408,7 @@ struct uprobe *uprobe_register(struct inode *inode, return ERR_PTR(-EINVAL); /* - * This ensures that copy_from_page(), copy_to_page() and + * This ensures that uprobe_copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) @@ -1464,7 +1474,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) struct vm_area_struct *vma; int err = 0; - mmap_read_lock(mm); + mmap_write_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1481,7 +1491,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, vma, vaddr); } - mmap_read_unlock(mm); + mmap_write_unlock(mm); return err; } @@ -1727,7 +1737,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) return ret; } -void * __weak arch_uprobe_trampoline(unsigned long *psize) +void * __weak arch_uretprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; @@ -1759,7 +1769,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - insns = arch_uprobe_trampoline(&insns_size); + insns = arch_uretprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) @@ -1793,6 +1803,14 @@ static struct xol_area *get_xol_area(void) return area; } +void __weak arch_uprobe_clear_state(struct mm_struct *mm) +{ +} + +void __weak arch_uprobe_init_state(struct mm_struct *mm) +{ +} + /* * uprobe_clear_state - Free the area allocated for slots. */ @@ -1804,6 +1822,8 @@ void uprobe_clear_state(struct mm_struct *mm) delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); + arch_uprobe_clear_state(mm); + if (!area) return; @@ -2394,7 +2414,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ @@ -2678,6 +2698,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. 
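The refactoring above turns uprobe_write_opcode() into a thin wrapper around uprobe_write(), which takes an arbitrary byte count and a verification callback. A sketch of what a caller-supplied verifier could look like, with the return convention inferred from the "ret <= 0" check at the call site; the helper name, the use of MAX_UINSN_BYTES, and the expected-bytes semantics are illustrative, not part of the patch:

/*
 * Return >0 to go ahead with the write, 0 to skip it as already done,
 * or a negative errno to abort, mirroring verify_opcode() above.
 */
static int verify_expected_bytes(struct page *page, unsigned long vaddr,
				 uprobe_opcode_t *insn, int nbytes, void *data)
{
	u8 cur[MAX_UINSN_BYTES];
	u8 *expected = data;	/* caller-supplied bytes required on the page */

	if (nbytes > MAX_UINSN_BYTES)
		return -EINVAL;

	uprobe_copy_from_page(page, vaddr, cur, nbytes);
	if (memcmp(cur, expected, nbytes))
		return -EINVAL;	/* page changed underneath us */

	return 1;		/* contents as expected: perform the write */
}

/*
 * Would be wired up as:
 *   uprobe_write(auprobe, vma, vaddr, insn, nbytes,
 *                verify_expected_bytes, is_register, false, expected);
 */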
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2742,6 +2766,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; @@ -2753,6 +2780,23 @@ static void handle_swbp(struct pt_regs *regs) rcu_read_unlock_trace(); } +void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) +{ + struct uprobe *uprobe; + int is_swbp; + + guard(rcu_tasks_trace)(); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); + if (!uprobe) + return; + if (!get_utask()) + return; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + return; + handler_chain(uprobe, regs); +} + /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. diff --git a/kernel/exit.c b/kernel/exit.c index bb184a67ac73c..f03caf17b214b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -692,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38ba..70f2d4e2e8fef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -188,33 +188,33 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need zeroed memory without __GFP_ACCOUNT. 
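handle_syscall_uprobe() above leans on the scope-based guard() helper from <linux/cleanup.h>, so each early return implicitly drops the RCU-tasks-trace read lock. For comparison, a sketch of the open-coded equivalent (the guarded form in the patch is what actually runs):

static void handle_syscall_uprobe_opencoded(struct pt_regs *regs,
					    unsigned long bp_vaddr)
{
	struct uprobe *uprobe;
	int is_swbp;

	rcu_read_lock_trace();
	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
	if (uprobe && get_utask() && !arch_uprobe_ignore(&uprobe->arch, regs))
		handler_chain(uprobe, regs);
	rcu_read_unlock_trace();	/* guard() does this on every exit path */
}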
+ */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -223,11 +223,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -240,32 +241,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -273,55 +274,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. 
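Note what the thread_stack_free_rcu() fix above guards against: the struct vm_stack wrapper lives inside the stack being freed, so vfree() must be handed the vmalloc base (vm_area->addr), not the wrapper pointer itself. The underlying pattern as a generic sketch; the names are illustrative, assuming <linux/rcupdate.h> and <linux/slab.h>:

struct deferred_obj {
	struct rcu_head rcu;
	void *base;		/* the pointer that was actually allocated */
};

static void deferred_obj_free(struct rcu_head *rh)
{
	/* recover the wrapper from the embedded rcu_head */
	struct deferred_obj *obj = container_of(rh, struct deferred_obj, rcu);

	/* free the allocation base -- never a pointer that merely
	 * lives somewhere inside the memory being freed */
	kfree(obj->base);
}

/* deferred from the release path: call_rcu(&obj->rcu, deferred_obj_free); */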
- */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -330,7 +323,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -345,7 +338,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -377,8 +376,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -417,7 +415,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -437,11 +436,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -457,12 +456,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } @@ -1010,6 +1009,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; + arch_uprobe_init_state(mm); #endif } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3f02a0e452541..1da5e9d9da719 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD bool +config IRQ_KUNIT_TEST + bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + imply SMP + help + This option enables KUnit tests for the IRQ subsystem API. These are + only for development and testing, not for regular kernel use cases. + + If unsure, say N. 
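A note on the Kconfig idiom used for IRQ_KUNIT_TEST above: the conditional prompt ("... if !KUNIT_ALL_TESTS") hides the question whenever KUNIT_ALL_TESTS is enabled, while "default KUNIT_ALL_TESTS" makes the option follow that selection automatically, so a kitchen-sink KUnit build picks the suite up without manual configuration. This is the stock pattern used by most in-tree KUnit suites.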
+ endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c0f44c06d69df..6ab3a40556670 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o +obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c new file mode 100644 index 0000000000000..5161b56a12f93 --- /dev/null +++ b/kernel/irq/irq_test.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +static irqreturn_t noop_handler(int irq, void *data) +{ + return IRQ_HANDLED; +} + +static void noop(struct irq_data *data) { } +static unsigned int noop_ret(struct irq_data *data) { return 0; } + +static int noop_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + irq_data_update_effective_affinity(data, dest); + + return 0; +} + +static struct irq_chip fake_irq_chip = { + .name = "fake", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + .irq_set_affinity = noop_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_free_disabled_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + free_irq(virq, NULL); + KUNIT_EXPECT_GE(test, desc->depth, 1); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_shutdown_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + .mask = CPU_MASK_ALL, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + 
KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + irq_shutdown_and_deactivate(desc); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + + KUNIT_EXPECT_EQ(test, irq_activate(desc), 0); +#ifdef CONFIG_SMP + irq_startup_managed(desc); +#endif + + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_cpuhotplug_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for CPU hotplug"); + if (!get_cpu_device(1)) + kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); + if (!cpu_is_hotpluggable(1)) + kunit_skip(test, "CPU 1 must be hotpluggable"); + + cpumask_copy(&affinity.mask, cpumask_of(1)); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_GE(test, desc->depth, 1); + KUNIT_EXPECT_EQ(test, add_cpu(1), 0); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static struct kunit_case irq_test_cases[] = { + KUNIT_CASE(irq_disable_depth_test), + KUNIT_CASE(irq_free_disabled_test), + KUNIT_CASE(irq_shutdown_depth_test), + KUNIT_CASE(irq_cpuhotplug_test), + {} +}; + +static struct kunit_suite irq_test_suite = { + .name = "irq_test_cases", + .test_cases = irq_test_cases, +}; + +kunit_test_suite(irq_test_suite); +MODULE_DESCRIPTION("IRQ unit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda1..1d85597057e18 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. 
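Since the suite registers through kunit_test_suite(), building with CONFIG_IRQ_KUNIT_TEST=y runs it automatically at boot, with results in KTAP format in the kernel log; it can equally be driven by the KUnit wrapper, e.g. tools/testing/kunit/kunit.py run with that option added to the kunitconfig. The cpuhotplug case additionally needs an SMP kernel with a second, hotpluggable CPU, which is why it guards itself with kunit_skip().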
*/ static void kcov_fault_in_area(struct kcov *kcov) diff --git a/kernel/kthread.c b/kernel/kthread.c index 85fc068f0083d..0e98b228a8efd 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { diff --git a/kernel/panic.c b/kernel/panic.c index b0b9a8bf4560d..8cb3649a79b61 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -753,13 +753,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint, disable_trace_on_warning(); - if (file) - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", - raw_smp_processor_id(), current->pid, file, line, - caller); - else - pr_warn("WARNING: CPU: %d PID: %d at %pS\n", - raw_smp_processor_id(), current->pid, caller); + if (file) { + pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", + file, line, caller, + raw_smp_processor_id(), current->comm, current->pid); + } else { + pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", + caller, + raw_smp_processor_id(), current->comm, current->pid); + } #pragma GCC diagnostic push #ifndef __clang__ diff --git a/kernel/relay.c b/kernel/relay.c index c0c93a04d4ce9..8d915fe981989 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; @@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = 
buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } @@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -655,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -693,6 +696,42 @@ void relay_flush(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_flush); +/** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of the particular field that the caller specifies. + */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } + } + } + + return count; +} + /** * relay_file_open - open file op for relay files * @inode: the inode diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e77..cdea931aae306 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; @@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eafd..06c82b2bdfb54 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { @@ 
-41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b7260..c4a488e67aa7d 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,11 +50,9 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif +#include "pelt.c" #include "cputime.c" #include "deadline.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b700..e2cf3b08d4e95 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,11 +80,10 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -# include "topology.c" -#endif +#include "cpupri.c" +#include "stop_task.c" + +#include "topology.c" #ifdef CONFIG_SCHED_CORE # include "core_sched.c" diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b481402..f5e6dd6a6b3af 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4e..19ee702273c0f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. 
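Returning to the relay changes a few hunks up: relay_stats() replaces the old last_toobig warning in relay_close() with counters a client can poll at any time. A sketch of the consumer side, with the flag names taken from that hunk and assumed to be exported via <linux/relay.h>:

static void report_relay_drops(struct rchan *chan)
{
	size_t full = relay_stats(chan, RELAY_STATS_BUF_FULL);
	size_t big  = relay_stats(chan, RELAY_STATS_WRT_BIG);

	if (full)
		pr_warn("relay: %zu sub-buffer switches found the buffer full\n", full);
	if (big)
		pr_warn("relay: %zu writes exceeded the sub-buffer size\n", big);
}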
*/ +#include +#include +#include +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8988d38d46a38..0e3a00e2a2cca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -481,13 +481,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -650,7 +650,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -667,7 +666,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* * __task_rq_lock - lock the rq @p resides on. @@ -853,8 +851,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -899,33 +895,12 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -933,7 +908,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -949,7 +924,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -988,13 +963,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1167,7 +1140,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ -1374,10 +1346,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || 
defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. @@ -1971,7 +1941,7 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2037,13 +2007,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK: */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -2353,8 +2323,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); @@ -3305,6 +3273,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3358,6 +3328,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) @@ -3661,17 +3632,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3682,7 +3642,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3702,7 +3661,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3731,13 +3689,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3748,7 +3704,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3770,7 +3725,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif } /* @@ -3824,7 +3778,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3891,7 
+3844,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3992,15 +3947,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4256,7 +4202,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4335,9 +4280,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else - cpu = task_cpu(p); -#endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4370,14 +4312,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. */ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } @@ -4533,10 +4473,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif init_sched_mm_cid(p); } @@ -4599,8 +4537,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4787,14 +4725,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4871,7 +4806,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4883,7 +4817,6 @@ void wake_up_new_task(struct task_struct *p) p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4891,7 +4824,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4901,7 +4833,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } @@ -4978,7 +4909,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, 
__fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4990,11 +4921,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -5003,12 +4933,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5021,11 +4949,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5107,14 +5032,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5502,8 +5419,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5527,8 +5442,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5563,7 +5476,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5690,12 +5603,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -5835,10 +5746,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -6553,7 +6464,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6565,7 +6476,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). 
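prepare_task()/finish_task() above are one half of a classic ownership handoff: the smp_store_release() of p->on_cpu pairs with the smp_cond_load_acquire() in try_to_wake_up(). The generic shape of the pairing, as a sketch assuming <linux/atomic.h>:

static int owner_flag;

static void owner_drop(void)
{
	/* all prior writes become visible before the 0 lands */
	smp_store_release(&owner_flag, 0);
}

static void taker_wait(void)
{
	/* returns once owner_flag reads 0, with acquire ordering */
	smp_cond_load_acquire(&owner_flag, !VAL);
	/* everything the owner wrote before dropping is now visible */
}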
@@ -6992,14 +6903,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -7009,8 +6920,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7065,14 +6976,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7082,7 +6993,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7301,7 +7212,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) preempt_enable(); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7332,17 +7243,17 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif 
defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { @@ -7360,8 +7271,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7427,9 +7338,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include -#endif +# ifdef CONFIG_GENERIC_ENTRY +# include +# endif /* * SC:cond_resched @@ -7484,37 +7395,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); @@ -7618,7 +7529,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -7819,12 +7730,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7840,13 +7749,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. 
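For readers new to PREEMPT_DYNAMIC's mechanics: on HAVE_PREEMPT_DYNAMIC_CALL architectures the preempt_dynamic_enable()/disable() macros above patch a static call's target, so the chosen preemption behaviour costs a direct call rather than a conditional or indirect one. A stripped-down sketch of the same machinery, with invented names and assuming <linux/static_call.h>:

static int my_resched(void)
{
	return 1;	/* stand-in for the real __cond_resched() */
}

/* starts out pointing at the return-0 stub, typed after my_resched() */
DEFINE_STATIC_CALL_RET0(sched_hint, my_resched);

static void sched_hint_set(bool enabled)
{
	if (enabled)
		static_call_update(sched_hint, my_resched);
	else	/* same "disabled" target the *_dynamic_disabled defines use */
		static_call_update(sched_hint, (void *)&__static_call_return0);
}

/* call sites: ret = static_call(sched_hint)(); */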
*/ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7865,9 +7772,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7880,13 +7785,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -8123,7 +8024,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8137,7 +8038,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8446,7 +8347,7 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { @@ -8480,13 +8381,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8512,9 +8406,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8557,9 +8449,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8620,7 +8510,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8646,7 +8535,6 @@ void __init sched_init(void) #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); #endif -#endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8694,10 +8582,9 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); -#endif init_sched_fair_class(); init_sched_ext_class(); @@ -8830,7 +8717,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8861,8 +8748,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8902,7 +8789,7 @@ void normalize_rt_tasks(void) #endif /* 
CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. * @@ -8926,7 +8813,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9422,47 +9309,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ -/* More than 203 days if BW_SHIFT equals 20. */ -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; - static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, - u64 burst) +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 period, quota, burst; - if (tg == &root_task_group) - return -EINVAL; + period = (u64)period_us * NSEC_PER_USEC; - /* - * Ensure we have at some amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. - */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; - - /* - * Likewise, bound things on the other side by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. - */ - if (period > max_cfs_quota_period) - return -EINVAL; - - /* - * Bound quota to defend quota against overflow during bandwidth shift. 
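The rewrite in progress here changes the calling convention: tg_set_cfs_bandwidth() now takes period, quota, and burst in microseconds and scales to nanoseconds internally, keeping RUNTIME_INF as an unscaled sentinel (the quota and burst conversions follow just below, and the overflow bound on the inputs is enforced by the tg_set_bandwidth() wrapper further down). In sketch form:

static u64 bw_us_to_ns(u64 us)
{
	if (us == RUNTIME_INF)	/* "unlimited" sentinel, never scaled */
		return RUNTIME_INF;

	/* callers pre-bound us by U64_MAX / NSEC_PER_USEC, so no overflow */
	return us * NSEC_PER_USEC;
}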
- */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9517,28 +9380,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static u64 tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } -static long tg_get_cfs_quota(struct task_group *tg) +static u64 tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; + return RUNTIME_INF; quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); @@ -9546,45 +9403,7 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_period(struct task_group *tg) -{ - u64 cfs_period_us; - - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); - - return cfs_period_us; -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static long tg_get_cfs_burst(struct task_group *tg) +static u64 tg_get_cfs_burst(struct task_group *tg) { u64 burst_us; @@ -9594,42 +9413,6 @@ static long tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), 
cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9762,6 +9545,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } + +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + +static void tg_bandwidth(struct task_group *tg, + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) +{ + if (period_us_p) + *period_us_p = tg_get_cfs_period(tg); + if (quota_us_p) + *quota_us_p = tg_get_cfs_quota(tg); + if (burst_us_p) + *burst_us_p = tg_get_cfs_burst(tg); +} + +static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 period_us; + + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); + return period_us; +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. 
+ */ + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) + return -EINVAL; + + if (quota_us != RUNTIME_INF && (burst_us > quota_us || + burst_us + quota_us > max_bw_runtime_us)) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +} + +static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 quota_us; + + tg_bandwidth(css_tg(css), NULL, &quota_us, NULL); + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ +} + +static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 burst_us; + + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); + return burst_us; +} + +static int cpu_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period_us) +{ + struct task_group *tg = css_tg(css); + u64 quota_us, burst_us; + + tg_bandwidth(tg, NULL, &quota_us, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 quota_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, burst_us; + + if (quota_us < 0) + quota_us = RUNTIME_INF; + + tg_bandwidth(tg, &period_us, NULL, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} + +static int cpu_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 burst_us) +{ + struct task_group *tg = css_tg(css); + u64 period_us, quota_us; + + tg_bandwidth(tg, &period_us, &quota_us, NULL); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); +} #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9807,7 +9710,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9824,19 +9727,19 @@ static struct cftype cpu_legacy_files[] = { #endif #ifdef CONFIG_CFS_BANDWIDTH { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, + .name = "cfs_period_us", + .read_u64 = cpu_period_read_u64, + .write_u64 = cpu_period_write_u64, }, { - .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, + .name = "cfs_quota_us", + .read_s64 = cpu_quota_read_s64, + .write_s64 = cpu_quota_write_s64, }, { .name = "cfs_burst_us", - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, { .name = "stat", @@ -9935,7 +9838,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10033,22 +9936,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, } /* caller should put the current value in *@periodp before calling */ -static int __maybe_unused cpu_period_quota_parse(char *buf, - u64 *periodp, u64 *quotap) +static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, + u64 *quota_us_p) { char tok[21]; /* U64_MAX */ - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) return -EINVAL; - *periodp *= NSEC_PER_USEC; - - if (sscanf(tok, "%llu", quotap)) - *quotap *= NSEC_PER_USEC; - else if (!strcmp(tok, "max")) - *quotap = RUNTIME_INF; - else - return -EINVAL; + if (sscanf(tok, "%llu", 
quota_us_p) < 1) { + if (!strcmp(tok, "max")) + *quota_us_p = RUNTIME_INF; + else + return -EINVAL; + } return 0; } @@ -10057,8 +9958,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + u64 period_us, quota_us; - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + tg_bandwidth(tg, &period_us, &quota_us, NULL); + cpu_period_quota_print(sf, period_us, quota_us); return 0; } @@ -10066,17 +9969,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct task_group *tg = css_tg(of_css(of)); - u64 period = tg_get_cfs_period(tg); - u64 burst = tg->cfs_bandwidth.burst; - u64 quota; + u64 period_us, quota_us, burst_us; int ret; - ret = cpu_period_quota_parse(buf, &period, &quota); + tg_bandwidth(tg, &period_us, NULL, &burst_us); + ret = cpu_period_quota_parse(buf, &period_us, &quota_us); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota, burst); + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10109,10 +10011,10 @@ static struct cftype cpu_files[] = { { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .read_u64 = cpu_burst_read_u64, + .write_u64 = cpu_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10126,7 +10028,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10147,7 +10049,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10733,7 +10635,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid != -1); t->mm_cid_active = 1; } -#endif +#endif /* CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, @@ -10768,4 +10670,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) if (ctx->running) set_next_task(rq, ctx->p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca892102..9ede71ecba7f8 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. */ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda099496..23a56ba12d811 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... 
*/ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a10293..cdd740b3f7743 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli */ +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb5..11c0f1faa7e11 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include #define IDX_INVALID -1 @@ -15,7 +17,6 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); @@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191faea..742fb9e62e1af 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 461242ec958af..0ab5f9d4bc59a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) @@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf8363..76a9ac5eb794a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. 
*/ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064c..6f562088c0565 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) @@ -17,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c08..7097de2c8cda2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include +#include +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include @@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245e..0f30697ad7956 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include +#include +#include +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util @@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif 
+#endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1195,7 +1132,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. @@ -1209,7 +1145,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1339,7 +1274,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1356,7 +1290,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. 
*/ } -#endif enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1598,7 +1531,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1844,8 +1777,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1881,13 +1812,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -2214,8 +2138,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2345,7 +2267,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2359,7 +2280,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... @@ -2367,7 +2287,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2375,11 +2294,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -2500,8 +2419,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2995,8 +2912,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3069,10 +2984,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3092,7 +3005,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP /* * This might be too much, but unfortunately * we don't have the old deadline value, and @@ -3121,13 +3033,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. 
- */ - resched_curr(rq); -#endif } #ifdef CONFIG_SCHED_CORE @@ -3149,7 +3054,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3158,7 +3062,6 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3458,7 +3361,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3570,7 +3472,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf080751..f16d53928bb97 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include +#include +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and @@ -90,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -512,7 +507,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -520,7 +514,6 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -530,7 +523,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, 
&sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -540,8 +533,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -652,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -690,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -854,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -874,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -929,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); @@ -951,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -976,12 +957,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1163,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1251,7 +1230,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1260,13 +1238,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", 
p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a14da5396fb2..7e2963efe800d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -996,7 +994,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1005,6 @@ int sched_update_scaling(void) return 0; } -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,7 +1037,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" -#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -1131,18 +1126,6 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* CONFIG_SMP */ - static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { u64 now = rq_clock_task(rq); @@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3673,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3690,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* 
!CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3711,12 +3693,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3768,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3785,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3822,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { @@ -3863,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. 
@@ -3970,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3991,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se) if (throttled_hierarchy(gcfs_rq)) return; -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4029,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -4481,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4494,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4575,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -5144,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -5253,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + 
w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: @@ -5685,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ static bool cfs_bandwidth_used(void) { return true; @@ -5693,16 +5615,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ - -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} +#endif /* !CONFIG_JUMP_LABEL */ static inline u64 sched_cfs_bandwidth_slice(void) { @@ -6088,7 +6001,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6147,12 +6059,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6490,8 +6396,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -6518,7 +6422,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) * to fail. */ new = old * 2; - if (new < max_cfs_quota_period) { + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; cfs_b->burst *= 2; @@ -6552,7 +6456,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->period = us_to_ktime(default_bw_period_us()); cfs_b->burst = 0; cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; @@ -6608,7 +6512,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. 
*/ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6620,7 +6523,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6733,9 +6635,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } @@ -6777,7 +6679,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6822,7 +6724,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6831,9 +6733,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6875,9 +6776,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6886,12 +6784,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -7206,8 +7102,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } -#ifdef CONFIG_SMP - /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). 
*/ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); @@ -7677,7 +7571,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7698,7 +7592,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8743,9 +8637,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -8939,7 +8830,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; @@ -9055,7 +8946,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -9357,13 +9247,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9772,12 +9662,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9886,7 +9776,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9903,7 +9793,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10616,7 +10506,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10626,7 +10516,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -12772,7 +12662,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } 
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12781,7 +12671,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -12978,8 +12868,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13076,10 +12964,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13103,9 +12991,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. @@ -13199,15 +13087,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) list_add_leaf_cfs_rq(cfs_rq); } } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). 
@@ -13216,7 +13103,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13280,7 +13166,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13288,7 +13173,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13326,9 +13210,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13343,10 +13225,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13642,7 +13522,6 @@ DEFINE_SCHED_CLASS(fair) = { .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13652,7 +13531,6 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -13715,7 +13593,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13737,6 +13614,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f7..c39b089d4f09b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include +#include +#include +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; @@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. 
*/ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* * Idle tasks are unconditionally rescheduled: @@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 93b038d48900a..a4cf17b1fab06 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2a..e3a1ce923f9fc 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. */ +#include +#include "sched.h" /* * Global load-average calculations @@ -333,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f4..62fba83b7bb19 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deffd..fa83bbaf4f3e8 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot */ +#include "pelt.h" /* * Approximate: @@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66f..62c3fa543c0f2 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,4 +1,8 @@ -#ifdef CONFIG_SMP +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -174,63 +178,12 @@ static inline 
u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif - -#else - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif - +#endif /* !CONFIG_CFS_BANDWIDTH */ +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ad04a5c3162a2..5837cd8e7b97e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include +#include +#include +#include "sched.h" static int psi_bug __read_mostly; @@ -1035,7 +1039,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st psi_schedule_rtpoll_work(group, 1, false); } while ((group = group->parent)); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1651,7 +1655,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40422c370335..15d5855c542cb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. 
*/ static const u64 max_rt_runtime = MAX_BW; @@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -291,7 +292,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -327,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_SMP +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { @@ -430,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -485,12 +469,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED @@ -594,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -625,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. 
*/ @@ -798,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -930,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -977,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1033,7 +1004,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1075,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { @@ -1107,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1155,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1182,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1192,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1538,7 +1490,6 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1653,7 +1604,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1667,7 +1617,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ -1682,7 +1631,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif } static inline void 
set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1776,8 +1724,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -2451,7 +2397,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -2475,10 +2420,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2495,7 +2438,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) return; if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2509,11 +2451,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2549,9 +2486,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. @@ -2620,7 +2557,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { @@ -2634,7 +2571,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2643,7 +2579,6 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif .task_tick = task_tick_rt, @@ -2887,7 +2822,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2895,7 +2830,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed115..6803cfec7a1e1 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ +#include static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295e..105190b180203 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -401,6 +402,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_CFS_BANDWIDTH +extern const u64 max_bw_quota_period_us; + +/* + * default period for group bandwidth. + * default: 0.1s, units: microseconds + */ +static inline u64 default_bw_period_us(void) +{ + return 100000ULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; @@ -424,7 +438,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -442,15 +456,13 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -531,7 +543,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -539,7 +551,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -573,25 +585,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ 
-667,7 +674,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -699,7 +705,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -796,19 +801,13 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED @@ -839,7 +838,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -859,9 +857,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. */ struct rb_root_cached pushable_dl_tasks_root; -#else - struct dl_bw dl_bw; -#endif + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -932,7 +928,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. */ @@ -1008,7 +1003,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
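The annotation pattern running through these hunks is consistent: an #else branch gains a comment naming the condition that is true inside it, with a trailing colon ("/* !CONFIG_FOO: */"), while the matching #endif repeats the condition without the colon. A minimal sketch of the convention, using a hypothetical CONFIG_FOO symbol and stub:

	#ifdef CONFIG_FOO
	extern void foo_init(void);
	#else /* !CONFIG_FOO: */
	static inline void foo_init(void) { }	/* config-off stub, compiles away */
	#endif /* !CONFIG_FOO */

The stub keeps callers identical in both configurations, which is why so many of the removed !CONFIG_SMP blocks above consisted of nothing but empty inline functions.
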
@@ -1043,7 +1038,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1107,18 +1101,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1183,7 +1173,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1224,7 +1213,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1242,9 +1230,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif @@ -1271,9 +1257,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1294,12 +1278,12 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1313,32 +1297,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1500,6 +1476,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ + #ifdef CONFIG_RT_GROUP_SCHED # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DECLARE_STATIC_KEY_FALSE(rt_group_sched); @@ -1507,16 +1484,16 @@ static inline bool rt_group_sched_enabled(void) { return static_branch_unlikely(&rt_group_sched); } -# else +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ DECLARE_STATIC_KEY_TRUE(rt_group_sched); static inline bool rt_group_sched_enabled(void) { return static_branch_likely(&rt_group_sched); } -# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ -#else +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ # define rt_group_sched_enabled() false -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1574,9 +1551,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED 
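The rt_group_sched hunks above pair DECLARE_STATIC_KEY_TRUE with static_branch_likely() and DECLARE_STATIC_KEY_FALSE with static_branch_unlikely(), so the compile-time default and the branch hint always agree. A static key turns the check into a runtime-patched jump rather than a load-and-test; a minimal sketch with a hypothetical key (the jump_label API itself is real):

	#include <linux/jump_label.h>

	DEFINE_STATIC_KEY_FALSE(my_feature_key);	/* hypothetical key, default off */

	static inline bool my_feature_enabled(void)
	{
		/* patched in place by static_branch_enable()/disable(), no memory load */
		return static_branch_unlikely(&my_feature_key);
	}

Flipping it later, for example from an __init routine or a sysctl handler, is a call to static_branch_enable(&my_feature_key).
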
@@ -1757,7 +1734,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -1781,9 +1758,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1961,8 +1936,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2128,8 +2101,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2174,7 +2145,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2200,7 +2171,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif +#endif /* CONFIG_SMP */ } /* @@ -2280,11 +2251,7 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) @@ -2307,11 +2274,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. 
*/ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -2367,11 +2332,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x10 #define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP #define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 @@ -2416,7 +2377,6 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -2429,7 +2389,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2581,8 +2540,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2634,26 +2591,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2749,10 +2686,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } @@ -2918,10 +2853,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ @@ -2930,8 +2862,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -2954,7 +2884,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. 
*/ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3091,42 +3021,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3145,6 +3039,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3184,7 +3079,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3254,14 +3149,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3310,8 +3205,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3355,10 +3248,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3535,13 +3424,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3567,7 +3456,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3577,7 +3466,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3588,7 +3476,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3887,7 +3774,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3908,7 +3794,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif #ifdef CONFIG_RT_MUTEXES @@ -3949,21 +3834,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else - -static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif #ifdef CONFIG_SCHED_CLASS_EXT /* diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb02..7f151d96dba96 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31fd..d1c9429a4ac5a 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) @@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); 
for_each_domain(cpu, sd) { @@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1e..26f3fd4d34cea 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b5..2d4e279f05ee9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,8 +7,8 @@ * * See kernel/stop_machine.c */ +#include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a3..0fef6496c4c83 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 547c1f05b667e..77ae87f36e841 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. 
@@ -209,10 +209,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. @@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -642,7 +639,6 @@ int __sched_setscheduler(struct task_struct *p, goto unlock; } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -658,7 +654,6 @@ int __sched_setscheduler(struct task_struct *p, goto unlock; } } -#endif } /* Re-check policy now with rq lock held: */ @@ -1120,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1149,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { @@ -1242,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b958fe48e0205..8e06b1d22e91e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include #include +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) @@ -313,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -449,9 +451,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1318,8 +1320,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) update_group_capacity(sd, cpu); } -#ifdef CONFIG_SMP - /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { @@ -1374,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) } } -#endif /* CONFIG_SMP */ - /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -1598,7 +1596,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. 
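The sched_setaffinity() hunk above drops its IS_ENABLED(CONFIG_SMP) fallback along with the other CONFIG_SMP conditionals removed throughout this series, leaving a plain else. IS_ENABLED() (from <linux/kconfig.h>) is a compile-time boolean over Kconfig symbols, 1 for =y or =m and 0 otherwise, which lets the dead arm be eliminated while both arms stay type-checked. A sketch with a hypothetical option and helpers:

	#include <linux/kconfig.h>

	static void my_fastpath(void);	/* hypothetical helpers */
	static void my_fallback(void);

	static void choose_path(void)
	{
		if (IS_ENABLED(CONFIG_MY_FASTPATH)) {	/* hypothetical Kconfig symbol */
			my_fastpath();	/* compiled in only when the option is set */
		} else {
			my_fallback();	/* taken otherwise; the other arm folds away */
		}
	}
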
@@ -1714,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f47018..a6f00012c72e1 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec959..1088d3b7012cf 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738c..7daf2da09e8e1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -741,6 +741,26 @@ seccomp_prepare_user_filter(const char __user *user_filter) } #ifdef SECCOMP_ARCH_NATIVE +static bool seccomp_uprobe_exception(struct seccomp_data *sd) +{ +#if defined __NR_uretprobe || defined __NR_uprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + { +#ifdef __NR_uretprobe + if (sd->nr == __NR_uretprobe) + return true; +#endif +#ifdef __NR_uprobe + if (sd->nr == __NR_uprobe) + return true; +#endif + } +#endif + return false; +} + /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs @@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, return false; /* Our single exception to filtering. */ -#ifdef __NR_uretprobe -#ifdef SECCOMP_ARCH_COMPAT - if (sd->arch == SECCOMP_ARCH_NATIVE) -#endif - if (sd->nr == __NR_uretprobe) - return true; -#endif + if (seccomp_uprobe_exception(sd)) + return true; for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; @@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, +#endif +#ifdef __NR_uprobe + __NR_uprobe, #endif -1, /* negative terminated */ }; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1992b62e980b7..4503b60ce9bd2 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -18,8 +18,6 @@ #include "smpboot.h" -#ifdef CONFIG_SMP - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -76,8 +74,6 @@ void __init idle_threads_init(void) } #endif -#endif /* #ifdef CONFIG_SMP */ - static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c00a86931f8c6..bf5d05c635ffd 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -392,3 +392,4 @@ COND_SYSCALL(setuid16); COND_SYSCALL(rseq); COND_SYSCALL(uretprobe); +COND_SYSCALL(uprobe); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a8bc7da90626..e400fe150f9d7 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. 
*/ - cpu = cpumask_first(cpu_online_mask); - if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); + cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; cpumask_set_cpu(cpu, &cpus_chosen); @@ -589,9 +587,7 @@ static void clocksource_watchdog(struct timer_list *unused) * Cycle through CPUs to check if the CPUs stay synchronized * to each other. */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); /* * Arm timer if not already pending: could race with concurrent diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 2f6330831f084..c0c54dc5314c3 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1405,23 +1405,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. + */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf6..5401b90061355 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. 
- */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 75af12ff774e7..41c299e63fab7 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -186,6 +186,29 @@ void watchdog_hardlockup_disable(unsigned int cpu) } } +/** + * hardlockup_detector_perf_adjust_period - Adjust the event period due + * to cpu frequency change + * @cpu: The CPU whose event period will be adjusted + * @period: The target period to be set + */ +void hardlockup_detector_perf_adjust_period(int cpu, u64 period) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) + return; + + if (!event) + return; + + if (event->attr.sample_period == period) + return; + + if (perf_event_period(event, period)) + pr_err("failed to change period to %llu\n", period); +} + /** * hardlockup_detector_perf_stop - Globally stop watchdog events * diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ebe33181b6e6e..ef00752a2b67e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -206,6 +206,16 @@ config DEBUG_BUGVERBOSE of the BUG call as well as the EIP and oops trace. This aids debugging but costs about 70-100K of memory. +config DEBUG_BUGVERBOSE_DETAILED + bool "Verbose WARN_ON_ONCE() reporting (adds 100K)" if DEBUG_BUGVERBOSE + help + Say Y here to make WARN_ON_ONCE() output the condition string of the + warning, in addition to the file name and line number. + This helps debugging, but costs about 100K of memory. + + Say N if unsure. + + endmenu # "printk and dmesg options" config DEBUG_KERNEL diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d48b80f3f007e..0142bc916f73b 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -10,6 +10,7 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) @@ -134,6 +135,9 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl struct codetag_bytes n; unsigned int i, nr = 0; + if (IS_ERR_OR_NULL(alloc_tag_cttype)) + return 0; + if (can_sleep) codetag_lock_module_list(alloc_tag_cttype, true); else if (!codetag_trylock_module_list(alloc_tag_cttype)) @@ -632,8 +636,13 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag mod->name); return -ENOMEM; } - } + /* + * Avoid a kmemleak false positive. The pointer to the counters is stored + * in the alloc_tag section of the module and cannot be directly accessed. 
+ */ + kmemleak_ignore_percpu(tag->counters); + } return 0; } diff --git a/lib/group_cpus.c b/lib/group_cpus.c index ee272c4cefcc1..18d43a406114b 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -352,6 +352,9 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) int ret = -ENOMEM; struct cpumask *masks = NULL; + if (numgrps == 0) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -426,8 +429,12 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) #else /* CONFIG_SMP */ struct cpumask *group_cpus_evenly(unsigned int numgrps) { - struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + struct cpumask *masks; + if (numgrps == 0) + return NULL; + + masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); if (!masks) return NULL; diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 00524e55a21e0..ef66be963798e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5319,6 +5319,7 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, struct maple_enode *start; if (mte_is_leaf(enode)) { + mte_set_node_dead(enode); node->type = mte_node_type(enode); goto free_leaf; } diff --git a/lib/math/div64.c b/lib/math/div64.c index 5faa29208bdb4..bf77b9843175e 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -212,12 +212,13 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) #endif - /* make sure c is not zero, trigger exception otherwise */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdiv-by-zero" - if (unlikely(c == 0)) - return 1/0; -#pragma GCC diagnostic pop + /* make sure c is not zero, trigger runtime exception otherwise */ + if (unlikely(c == 0)) { + unsigned long zero = 0; + + OPTIMIZER_HIDE_VAR(zero); + return ~0UL/zero; + } int shift = __builtin_ctzll(c); diff --git a/lib/math/gcd.c b/lib/math/gcd.c index e3b042214d1b7..62efca6787aef 100644 --- a/lib/math/gcd.c +++ b/lib/math/gcd.c @@ -11,22 +11,16 @@ * has decent hardware division. */ +DEFINE_STATIC_KEY_TRUE(efficient_ffs_key); + #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) /* If __ffs is available, the even/odd algorithm benchmarks slower. */ -/** - * gcd - calculate and return the greatest common divisor of 2 unsigned longs - * @a: first value - * @b: second value - */ -unsigned long gcd(unsigned long a, unsigned long b) +static unsigned long binary_gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; - if (!a || !b) - return r; - b >>= __ffs(b); if (b == 1) return r & -r; @@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#else +#endif /* If normalization is done by loops, the even/odd algorithm is a win. 
*/ + +/** + * gcd - calculate and return the greatest common divisor of 2 unsigned longs + * @a: first value + * @b: second value + */ unsigned long gcd(unsigned long a, unsigned long b) { unsigned long r = a | b; @@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b) if (!a || !b) return r; +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) + if (static_branch_likely(&efficient_ffs_key)) + return binary_gcd(a, b); +#endif + /* Isolate lsbit of r */ r &= -r; @@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b) } } -#endif - EXPORT_SYMBOL_GPL(gcd); diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 75ce3e134b7c7..799e0e5eac26d 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,9 +18,6 @@ #else #include #include -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -EXPORT_SYMBOL(raid6_empty_zero_page); #endif struct raid6_calls raid6_call; diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c index a7c1b2bbe40d8..b5e47c008b41b 100644 --- a/lib/raid6/recov.c +++ b/lib/raid6/recov.c @@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c index 4e8095403ee28..97d598d2535ca 100644 --- a/lib/raid6/recov_avx2.c +++ b/lib/raid6/recov_avx2.c @@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c index 310c715db3134..7986120ca4442 100644 --- a/lib/raid6/recov_avx512.c +++ b/lib/raid6/recov_avx512.c @@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -238,7 +238,7 @@ static void 
raid6_datap_recov_avx512(int disks, size_t bytes, int faila, */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c index 94aeac85e6f75..93dc515997a14 100644 --- a/lib/raid6/recov_loongarch_simd.c +++ b/lib/raid6/recov_loongarch_simd.c @@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index 1bfc14174d4dd..70e1404c15129 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, * Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_rvv.c b/lib/raid6/recov_rvv.c index f29303795ccfe..5d54c4b437df7 100644 --- a/lib/raid6/recov_rvv.c +++ b/lib/raid6/recov_rvv.c @@ -165,10 +165,10 @@ static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, * delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, * Use the dead data 
page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks - 1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c index 179eec900cea4..1d32c01261be0 100644 --- a/lib/raid6/recov_s390xc.c +++ b/lib/raid6/recov_s390xc.c @@ -35,10 +35,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -82,7 +82,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c index 4bfa3c6b60de7..2e849185c32b3 100644 --- a/lib/raid6/recov_ssse3.c +++ b/lib/raid6/recov_ssse3.c @@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, Use the dead data pages as temporary storage for delta p and delta q */ dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-2] = dp; dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[failb] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); @@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, /* Compute syndrome with zero for the missing data page Use the dead data page as temporary storage for delta q */ dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[faila] = raid6_get_zero_page(); ptrs[disks-1] = dq; raid6_call.gen_syndrome(disks, bytes, ptrs); diff --git a/lib/test_objagg.c b/lib/test_objagg.c index d34df4306b874..a67b8ef5c5beb 100644 --- a/lib/test_objagg.c +++ b/lib/test_objagg.c @@ -906,50 +906,22 @@ static int check_expect_hints_stats(struct objagg_hints *objagg_hints, return err; } -static int test_hints_case(const struct hints_case *hints_case) +static int test_hints_case2(const struct hints_case *hints_case, + struct objagg_hints *hints, struct objagg *objagg) { struct objagg_obj *objagg_obj; - struct objagg_hints *hints; struct world world2 = {}; - struct world world = {}; struct objagg *objagg2; - struct objagg *objagg; const char *errmsg; int i; int err; - objagg = objagg_create(&delta_ops, NULL, &world); - if (IS_ERR(objagg)) - return PTR_ERR(objagg); - - for (i = 0; i < hints_case->key_ids_count; i++) { - objagg_obj = world_obj_get(&world, objagg, - hints_case->key_ids[i]); - if (IS_ERR(objagg_obj)) { - err = PTR_ERR(objagg_obj); - goto err_world_obj_get; - } - } - - pr_debug_stats(objagg); - err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg); - if (err) { - pr_err("Stats: %s\n", errmsg); - goto err_check_expect_stats; - } - - hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY); - if (IS_ERR(hints)) { - err = PTR_ERR(hints); - goto err_hints_get; - } - 
pr_debug_hints_stats(hints); err = check_expect_hints_stats(hints, &hints_case->expect_stats_hints, &errmsg); if (err) { pr_err("Hints stats: %s\n", errmsg); - goto err_check_expect_hints_stats; + return err; } objagg2 = objagg_create(&delta_ops, hints, &world2); @@ -981,7 +953,48 @@ static int test_hints_case(const struct hints_case *hints_case) world_obj_put(&world2, objagg, hints_case->key_ids[i]); i = hints_case->key_ids_count; objagg_destroy(objagg2); -err_check_expect_hints_stats: + + return err; +} + +static int test_hints_case(const struct hints_case *hints_case) +{ + struct objagg_obj *objagg_obj; + struct objagg_hints *hints; + struct world world = {}; + struct objagg *objagg; + const char *errmsg; + int i; + int err; + + objagg = objagg_create(&delta_ops, NULL, &world); + if (IS_ERR(objagg)) + return PTR_ERR(objagg); + + for (i = 0; i < hints_case->key_ids_count; i++) { + objagg_obj = world_obj_get(&world, objagg, + hints_case->key_ids[i]); + if (IS_ERR(objagg_obj)) { + err = PTR_ERR(objagg_obj); + goto err_world_obj_get; + } + } + + pr_debug_stats(objagg); + err = check_expect_stats(objagg, &hints_case->expect_stats, &errmsg); + if (err) { + pr_err("Stats: %s\n", errmsg); + goto err_check_expect_stats; + } + + hints = objagg_hints_get(objagg, OBJAGG_OPT_ALGO_SIMPLE_GREEDY); + if (IS_ERR(hints)) { + err = PTR_ERR(hints); + goto err_hints_get; + } + + err = test_hints_case2(hints_case, hints, objagg); + objagg_hints_put(hints); err_hints_get: err_check_expect_stats: diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 0f6c9e1fec0bf..30ae7518ffbf6 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -472,6 +472,7 @@ static ssize_t memcg_path_store(struct kobject *kobj, return -ENOMEM; strscpy(path, buf, count + 1); + kfree(filter->memcg_path); filter->memcg_path = path; return count; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8746ed2fec135..2d66408956568 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2340,12 +2340,15 @@ struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, struct folio *folio; spin_lock_irq(&hugetlb_lock); + if (WARN_ON_ONCE(!h->resv_huge_pages)) { + spin_unlock_irq(&hugetlb_lock); + return NULL; + } + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, nmask); - if (folio) { - VM_BUG_ON(!h->resv_huge_pages); + if (folio) h->resv_huge_pages--; - } spin_unlock_irq(&hugetlb_lock); return folio; @@ -2787,20 +2790,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve * the old one - * @h: struct hstate old page belongs to * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, - struct folio *old_folio, struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio, + struct list_head *list) { - gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + gfp_t gfp_mask; + struct hstate *h; int nid = folio_nid(old_folio); struct folio *new_folio = NULL; int ret = 0; retry: + /* + * The old_folio might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. 
+ */ spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* @@ -2829,8 +2836,10 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, cond_resched(); goto retry; } else { + h = folio_hstate(old_folio); if (!new_folio) { spin_unlock_irq(&hugetlb_lock); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); if (!new_folio) @@ -2874,35 +2883,24 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { - struct hstate *h; int ret = -EBUSY; - /* - * The page might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. - * Return success when racing as if we dissolved the page ourselves. - */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (!folio_test_hugetlb(folio)) return 0; - } - spin_unlock_irq(&hugetlb_lock); /* * Fence off gigantic pages as there is a cyclic dependency between * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (hstate_is_gigantic(h)) + if (folio_order(folio) > MAX_PAGE_ORDER) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); + ret = alloc_and_dissolve_hugetlb_folio(folio, list); return ret; } @@ -2916,7 +2914,6 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) */ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) { - struct hstate *h; struct folio *folio; int ret = 0; @@ -2925,23 +2922,9 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) while (start_pfn < end_pfn) { folio = pfn_folio(start_pfn); - /* - * The folio might have been dissolved from under our feet, so make sure - * to carefully check the state under the lock. 
- */ - spin_lock_irq(&hugetlb_lock); - if (folio_test_hugetlb(folio)) { - h = folio_hstate(folio); - } else { - spin_unlock_irq(&hugetlb_lock); - start_pfn++; - continue; - } - spin_unlock_irq(&hugetlb_lock); - - if (!folio_ref_count(folio)) { - ret = alloc_and_dissolve_hugetlb_folio(h, folio, - &isolate_list); + /* Not to disrupt normal path by vainly holding hugetlb_lock */ + if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list); if (ret) break; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index da9cee34ee1b8..8d588e6853110 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1246,6 +1246,20 @@ void __ref kmemleak_transient_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_transient_leak); +/** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + /** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ab986dd09b6aa..6dbcdceecae13 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -514,6 +514,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { + int err = 0; pte_t *pte; /* @@ -530,12 +531,18 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, do { struct page *page = pages[*nr]; - if (WARN_ON(!pte_none(ptep_get(pte)))) - return -EBUSY; - if (WARN_ON(!page)) - return -ENOMEM; - if (WARN_ON(!pfn_valid(page_to_pfn(page)))) - return -EINVAL; + if (WARN_ON(!pte_none(ptep_get(pte)))) { + err = -EBUSY; + break; + } + if (WARN_ON(!page)) { + err = -ENOMEM; + break; + } + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) { + err = -EINVAL; + break; + } set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; @@ -543,7 +550,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, arch_leave_lazy_mmu_mode(); *mask |= PGTBL_PTE_MODIFIED; - return 0; + + return err; } static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 664f7b7a622c2..489b74d52abe7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3502,9 +3502,10 @@ sub process { # Check for various typo / spelling mistakes if (defined($misspellings) && ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) { - while ($rawline =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) { + my $rawline_utf8 = decode("utf8", $rawline); + while ($rawline_utf8 =~ /(?:^|[^\w\-'`])($misspellings)(?:[^\w\-'`]|$)/gi) { my $typo = $1; - my $blank = copy_spacing($rawline); + my $blank = copy_spacing($rawline_utf8); my $ptr = substr($blank, 0, $-[1]) . 
"^" x length($typo); my $hereptr = "$hereline$ptr\n"; my $typo_fix = $spelling_fix{lc($typo)}; diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in index fd6bd69c5096a..c3886739a0289 100644 --- a/scripts/gdb/linux/constants.py.in +++ b/scripts/gdb/linux/constants.py.in @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -73,12 +74,12 @@ if IS_BUILTIN(CONFIG_MODULES): LX_GDBPARSED(MOD_RO_AFTER_INIT) /* linux/mount.h */ -LX_VALUE(MNT_NOSUID) -LX_VALUE(MNT_NODEV) -LX_VALUE(MNT_NOEXEC) -LX_VALUE(MNT_NOATIME) -LX_VALUE(MNT_NODIRATIME) -LX_VALUE(MNT_RELATIME) +LX_GDBPARSED(MNT_NOSUID) +LX_GDBPARSED(MNT_NODEV) +LX_GDBPARSED(MNT_NOEXEC) +LX_GDBPARSED(MNT_NOATIME) +LX_GDBPARSED(MNT_NODIRATIME) +LX_GDBPARSED(MNT_RELATIME) /* linux/threads.h */ LX_VALUE(NR_CPUS) @@ -93,6 +94,12 @@ LX_GDBPARSED(RADIX_TREE_MAP_SIZE) LX_GDBPARSED(RADIX_TREE_MAP_SHIFT) LX_GDBPARSED(RADIX_TREE_MAP_MASK) +/* linux/maple_tree.h */ +LX_VALUE(MAPLE_NODE_SLOTS) +LX_VALUE(MAPLE_RANGE64_SLOTS) +LX_VALUE(MAPLE_ARANGE64_SLOTS) +LX_GDBPARSED(MAPLE_NODE_MASK) + /* linux/vmalloc.h */ LX_VALUE(VM_IOREMAP) LX_VALUE(VM_ALLOC) diff --git a/scripts/gdb/linux/interrupts.py b/scripts/gdb/linux/interrupts.py index 616a5f26377a8..1401be6a28fb2 100644 --- a/scripts/gdb/linux/interrupts.py +++ b/scripts/gdb/linux/interrupts.py @@ -7,7 +7,7 @@ from linux import constants from linux import cpus from linux import utils -from linux import radixtree +from linux import mapletree irq_desc_type = utils.CachedType("struct irq_desc") @@ -23,12 +23,12 @@ def irqd_is_level(desc): def show_irq_desc(prec, irq): text = "" - desc = radixtree.lookup(gdb.parse_and_eval("&irq_desc_tree"), irq) + desc = mapletree.mtree_load(gdb.parse_and_eval("&sparse_irqs"), irq) if desc is None: return text - desc = desc.cast(irq_desc_type.get_type()) - if desc is None: + desc = desc.cast(irq_desc_type.get_type().pointer()) + if desc == 0: return text if irq_settings_is_hidden(desc): @@ -110,7 +110,7 @@ def x86_show_mce(prec, var, pfx, desc): pvar = gdb.parse_and_eval(var) text = "%*s: " % (prec, pfx) for cpu in cpus.each_online_cpu(): - text += "%10u " % (cpus.per_cpu(pvar, cpu)) + text += "%10u " % (cpus.per_cpu(pvar, cpu).dereference()) text += " %s\n" % (desc) return text @@ -142,7 +142,7 @@ def x86_show_interupts(prec): if constants.LX_CONFIG_X86_MCE: text += x86_show_mce(prec, "&mce_exception_count", "MCE", "Machine check exceptions") - text == x86_show_mce(prec, "&mce_poll_count", "MCP", "Machine check polls") + text += x86_show_mce(prec, "&mce_poll_count", "MCP", "Machine check polls") text += show_irq_err_count(prec) @@ -221,7 +221,7 @@ def invoke(self, arg, from_tty): gdb.write("CPU%-8d" % cpu) gdb.write("\n") - if utils.gdb_eval_or_none("&irq_desc_tree") is None: + if utils.gdb_eval_or_none("&irq_desc_tree") or utils.gdb_eval_or_none("&sparse_irqs") is None: return for irq in range(nr_irqs): diff --git a/scripts/gdb/linux/mapletree.py b/scripts/gdb/linux/mapletree.py new file mode 100644 index 0000000000000..d52d51c0a03fc --- /dev/null +++ b/scripts/gdb/linux/mapletree.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Maple tree helpers +# +# Copyright (c) 2025 Broadcom +# +# Authors: +# Florian Fainelli + +import gdb + +from linux import utils +from linux import constants +from linux import xarray + +maple_tree_root_type = utils.CachedType("struct maple_tree") +maple_node_type = utils.CachedType("struct maple_node") +maple_enode_type = utils.CachedType("void") + +maple_dense = 0 
+maple_leaf_64 = 1 +maple_range_64 = 2 +maple_arange_64 = 3 + +class Mas(object): + ma_active = 0 + ma_start = 1 + ma_root = 2 + ma_none = 3 + ma_pause = 4 + ma_overflow = 5 + ma_underflow = 6 + ma_error = 7 + + def __init__(self, mt, first, end): + if mt.type == maple_tree_root_type.get_type().pointer(): + self.tree = mt.dereference() + elif mt.type != maple_tree_root_type.get_type(): + raise gdb.GdbError("must be {} not {}" + .format(maple_tree_root_type.get_type().pointer(), mt.type)) + self.tree = mt + self.index = first + self.last = end + self.node = None + self.status = self.ma_start + self.min = 0 + self.max = -1 + + def is_start(self): + # mas_is_start() + return self.status == self.ma_start + + def is_ptr(self): + # mas_is_ptr() + return self.status == self.ma_root + + def is_none(self): + # mas_is_none() + return self.status == self.ma_none + + def root(self): + # mas_root() + return self.tree['ma_root'].cast(maple_enode_type.get_type().pointer()) + + def start(self): + # mas_start() + if self.is_start() is False: + return None + + self.min = 0 + self.max = ~0 + + while True: + self.depth = 0 + root = self.root() + if xarray.xa_is_node(root): + self.depth = 0 + self.status = self.ma_active + self.node = mte_safe_root(root) + self.offset = 0 + if mte_dead_node(self.node) is True: + continue + + return None + + self.node = None + # Empty tree + if root is None: + self.status = self.ma_none + self.offset = constants.LX_MAPLE_NODE_SLOTS + return None + + # Single entry tree + self.status = self.ma_root + self.offset = constants.LX_MAPLE_NODE_SLOTS + + if self.index != 0: + return None + + return root + + return None + + def reset(self): + # mas_reset() + self.status = self.ma_start + self.node = None + +def mte_safe_root(node): + if node.type != maple_enode_type.get_type().pointer(): + raise gdb.GdbError("{} must be {} not {}" + .format(mte_safe_root.__name__, maple_enode_type.get_type().pointer(), node.type)) + ulong_type = utils.get_ulong_type() + indirect_ptr = node.cast(ulong_type) & ~0x2 + val = indirect_ptr.cast(maple_enode_type.get_type().pointer()) + return val + +def mte_node_type(entry): + ulong_type = utils.get_ulong_type() + val = None + if entry.type == maple_enode_type.get_type().pointer(): + val = entry.cast(ulong_type) + elif entry.type == ulong_type: + val = entry + else: + raise gdb.GdbError("{} must be {} not {}" + .format(mte_node_type.__name__, maple_enode_type.get_type().pointer(), entry.type)) + return (val >> 0x3) & 0xf + +def ma_dead_node(node): + if node.type != maple_node_type.get_type().pointer(): + raise gdb.GdbError("{} must be {} not {}" + .format(ma_dead_node.__name__, maple_node_type.get_type().pointer(), node.type)) + ulong_type = utils.get_ulong_type() + parent = node['parent'] + indirect_ptr = node['parent'].cast(ulong_type) & ~constants.LX_MAPLE_NODE_MASK + return indirect_ptr == node + +def mte_to_node(enode): + ulong_type = utils.get_ulong_type() + if enode.type == maple_enode_type.get_type().pointer(): + indirect_ptr = enode.cast(ulong_type) + elif enode.type == ulong_type: + indirect_ptr = enode + else: + raise gdb.GdbError("{} must be {} not {}" + .format(mte_to_node.__name__, maple_enode_type.get_type().pointer(), enode.type)) + indirect_ptr = indirect_ptr & ~constants.LX_MAPLE_NODE_MASK + return indirect_ptr.cast(maple_node_type.get_type().pointer()) + +def mte_dead_node(enode): + if enode.type != maple_enode_type.get_type().pointer(): + raise gdb.GdbError("{} must be {} not {}" + .format(mte_dead_node.__name__, 
maple_enode_type.get_type().pointer(), enode.type)) + node = mte_to_node(enode) + return ma_dead_node(node) + +def ma_is_leaf(tp): + result = tp < maple_range_64 + return tp < maple_range_64 + +def mt_pivots(t): + if t == maple_dense: + return 0 + elif t == maple_leaf_64 or t == maple_range_64: + return constants.LX_MAPLE_RANGE64_SLOTS - 1 + elif t == maple_arange_64: + return constants.LX_MAPLE_ARANGE64_SLOTS - 1 + +def ma_pivots(node, t): + if node.type != maple_node_type.get_type().pointer(): + raise gdb.GdbError("{}: must be {} not {}" + .format(ma_pivots.__name__, maple_node_type.get_type().pointer(), node.type)) + if t == maple_arange_64: + return node['ma64']['pivot'] + elif t == maple_leaf_64 or t == maple_range_64: + return node['mr64']['pivot'] + else: + return None + +def ma_slots(node, tp): + if node.type != maple_node_type.get_type().pointer(): + raise gdb.GdbError("{}: must be {} not {}" + .format(ma_slots.__name__, maple_node_type.get_type().pointer(), node.type)) + if tp == maple_arange_64: + return node['ma64']['slot'] + elif tp == maple_range_64 or tp == maple_leaf_64: + return node['mr64']['slot'] + elif tp == maple_dense: + return node['slot'] + else: + return None + +def mt_slot(mt, slots, offset): + ulong_type = utils.get_ulong_type() + return slots[offset].cast(ulong_type) + +def mtree_lookup_walk(mas): + ulong_type = utils.get_ulong_type() + n = mas.node + + while True: + node = mte_to_node(n) + tp = mte_node_type(n) + pivots = ma_pivots(node, tp) + end = mt_pivots(tp) + offset = 0 + while True: + if pivots[offset] >= mas.index: + break + if offset >= end: + break + offset += 1 + + slots = ma_slots(node, tp) + n = mt_slot(mas.tree, slots, offset) + if ma_dead_node(node) is True: + mas.reset() + return None + break + + if ma_is_leaf(tp) is True: + break + + return n + +def mtree_load(mt, index): + ulong_type = utils.get_ulong_type() + # MT_STATE(...) + mas = Mas(mt, index, index) + entry = None + + while True: + entry = mas.start() + if mas.is_none(): + return None + + if mas.is_ptr(): + if index != 0: + entry = None + return entry + + entry = mtree_lookup_walk(mas) + if entry is None and mas.is_start(): + continue + else: + break + + if xarray.xa_is_zero(entry): + return None + + return entry diff --git a/scripts/gdb/linux/vfs.py b/scripts/gdb/linux/vfs.py index c77b9ce75f6d2..b5fbb18ccb77a 100644 --- a/scripts/gdb/linux/vfs.py +++ b/scripts/gdb/linux/vfs.py @@ -22,7 +22,7 @@ def dentry_name(d): if parent == d or parent == 0: return "" p = dentry_name(d['d_parent']) + "/" - return p + d['d_iname'].string() + return p + d['d_shortname']['string'].string() class DentryName(gdb.Function): """Return string of the full path of a dentry. 
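The mapletree.py helpers above are what the interrupts.py change earlier in this series relies on to resolve an IRQ number to its struct irq_desc, now that sparse_irqs is a maple tree. Below is a minimal sketch of driving them interactively, assuming a gdb session with the kernel's vmlinux-gdb.py scripts loaded; lookup_irq_desc() is a hypothetical wrapper that mirrors the new show_irq_desc() logic, not an API added by the patch:

    import gdb
    from linux import mapletree, utils

    irq_desc_type = utils.CachedType("struct irq_desc")

    def lookup_irq_desc(irq):
        # mtree_load() walks the maple tree and returns the entry stored
        # at index `irq`, or None when nothing is mapped there.
        entry = mapletree.mtree_load(gdb.parse_and_eval("&sparse_irqs"), irq)
        if entry is None:
            return None
        # Entries are stored as type-erased void *; cast back to the
        # descriptor pointer before poking at fields.
        return entry.cast(irq_desc_type.get_type().pointer())

    # e.g., from the (gdb) prompt:
    #   (gdb) py print(lookup_irq_desc(1))

Internally, Mas.start() and mtree_load() lean on the xarray.py predicates added in the next file (xa_is_node(), xa_is_zero()) to tell encoded internal node pointers and zero entries apart from real stored values.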
diff --git a/scripts/gdb/linux/xarray.py b/scripts/gdb/linux/xarray.py new file mode 100644 index 0000000000000..f4477b5def75f --- /dev/null +++ b/scripts/gdb/linux/xarray.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Xarray helpers +# +# Copyright (c) 2025 Broadcom +# +# Authors: +# Florian Fainelli + +import gdb + +from linux import utils +from linux import constants + +def xa_is_internal(entry): + ulong_type = utils.get_ulong_type() + return ((entry.cast(ulong_type) & 3) == 2) + +def xa_mk_internal(v): + return ((v << 2) | 2) + +def xa_is_zero(entry): + ulong_type = utils.get_ulong_type() + return entry.cast(ulong_type) == xa_mk_internal(257) + +def xa_is_node(entry): + ulong_type = utils.get_ulong_type() + return xa_is_internal(entry) and (entry.cast(ulong_type) > 4096) diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 11def1ad046c8..20bbd461515e5 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays procacct +PROGS := getdelays procacct delaytop all: $(PROGS) diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c new file mode 100644 index 0000000000000..23e38f39e97d0 --- /dev/null +++ b/tools/accounting/delaytop.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * delaytop.c - task delay monitoring tool. + * + * This tool provides real-time monitoring and statistics of + * system, container, and task-level delays, including CPU, + * memory, IO, and IRQ delay accounting. It supports both + * interactive (top-like) display and one-shot output of delay + * information for the whole system, specific containers + * (cgroups), or individual tasks (PIDs). + * + * Key features: + * - Collects per-task delay accounting statistics via taskstats. + * - Supports sorting, filtering. + * - Supports interactive (screen refresh) and one-shot output. + * + * Copyright (C) Fan Yu, ZTE Corp. 2025 + * Copyright (C) Wang Yaxin, ZTE Corp. 
2025 + * + * Compile with + * gcc -I/usr/src/linux/include delaytop.c -o delaytop + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) + +#define TASK_COMM_LEN 16 +#define MAX_MSG_SIZE 1024 +#define MAX_TASKS 1000 +#define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field + +/* Program settings structure */ +struct config { + int delay; /* Update interval in seconds */ + int iterations; /* Number of iterations, 0 == infinite */ + int max_processes; /* Maximum number of processes to show */ + char sort_field; /* Field to sort by */ + int output_one_time; /* Output once and exit */ + int monitor_pid; /* Monitor specific PID */ + char *container_path; /* Path to container cgroup */ +}; + +/* Task delay information structure */ +struct task_info { + int pid; + int tgid; + char command[TASK_COMM_LEN]; + unsigned long long cpu_count; + unsigned long long cpu_delay_total; + unsigned long long blkio_count; + unsigned long long blkio_delay_total; + unsigned long long swapin_count; + unsigned long long swapin_delay_total; + unsigned long long freepages_count; + unsigned long long freepages_delay_total; + unsigned long long thrashing_count; + unsigned long long thrashing_delay_total; + unsigned long long compact_count; + unsigned long long compact_delay_total; + unsigned long long wpcopy_count; + unsigned long long wpcopy_delay_total; + unsigned long long irq_count; + unsigned long long irq_delay_total; +}; + +/* Container statistics structure */ +struct container_stats { + int nr_sleeping; /* Number of sleeping processes */ + int nr_running; /* Number of running processes */ + int nr_stopped; /* Number of stopped processes */ + int nr_uninterruptible; /* Number of uninterruptible processes */ + int nr_io_wait; /* Number of processes in IO wait */ +}; + +/* Global variables */ +static struct config cfg; +static struct task_info tasks[MAX_TASKS]; +static int task_count; +static int running = 1; +static struct container_stats container_stats; + +/* Netlink socket variables */ +static int nl_sd = -1; +static int family_id; + +/* Set terminal to non-canonical mode for q-to-quit */ +static struct termios orig_termios; +static void enable_raw_mode(void) +{ + struct termios raw; + + tcgetattr(STDIN_FILENO, &orig_termios); + raw = orig_termios; + raw.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &raw); +} +static void disable_raw_mode(void) +{ + tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); +} + +/* Display usage information and command line options */ +static void usage(void) +{ + printf("Usage: delaytop [Options]\n" + "Options:\n" + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified 
cgroup path\n"); + exit(0); +} + +/* Parse command line arguments and set configuration */ +static void parse_args(int argc, char **argv) +{ + int c; + struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"delay", required_argument, 0, 'd'}, + {"iterations", required_argument, 0, 'n'}, + {"pid", required_argument, 0, 'p'}, + {"once", no_argument, 0, 'o'}, + {"processes", required_argument, 0, 'P'}, + {"container", required_argument, 0, 'C'}, + {0, 0, 0, 0} + }; + + /* Set defaults */ + cfg.delay = 2; + cfg.iterations = 0; + cfg.max_processes = 20; + cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.output_one_time = 0; + cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ + cfg.container_path = NULL; + + while (1) { + int option_index = 0; + + c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + usage(); + break; + case 'd': + cfg.delay = atoi(optarg); + if (cfg.delay < 1) { + fprintf(stderr, "Error: delay must be >= 1.\n"); + exit(1); + } + break; + case 'n': + cfg.iterations = atoi(optarg); + if (cfg.iterations < 0) { + fprintf(stderr, "Error: iterations must be >= 0.\n"); + exit(1); + } + break; + case 'p': + cfg.monitor_pid = atoi(optarg); + if (cfg.monitor_pid < 1) { + fprintf(stderr, "Error: pid must be >= 1.\n"); + exit(1); + } + break; + case 'o': + cfg.output_one_time = 1; + break; + case 'P': + cfg.max_processes = atoi(optarg); + if (cfg.max_processes < 1) { + fprintf(stderr, "Error: processes must be >= 1.\n"); + exit(1); + } + if (cfg.max_processes > MAX_TASKS) { + fprintf(stderr, "Warning: processes capped to %d.\n", + MAX_TASKS); + cfg.max_processes = MAX_TASKS; + } + break; + case 'C': + cfg.container_path = strdup(optarg); + break; + default: + fprintf(stderr, "Try 'delaytop --help' for more information.\n"); + exit(1); + } + } +} + +/* Create a raw netlink socket and bind */ +static int create_nl_socket(void) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd < 0) + return -1; + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) { + close(fd); + return -1; + } + + return fd; +} + +/* Send a command via netlink */ +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct sockaddr_nl nladdr; + struct nlattr *na; + int r, buflen; + char *buf; + + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + +/* Get family ID for taskstats via netlink */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct 
nlattr *na; + int rep_len; + char name[100]; + + strncpy(name, TASKSTATS_GENL_NAME, sizeof(name) - 1); + name[sizeof(name) - 1] = '\0'; + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + return id; +} + +static int read_comm(int pid, char *comm_buf, size_t buf_size) +{ + char path[64]; + size_t len; + FILE *fp; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fp = fopen(path, "r"); + if (!fp) + return -1; + if (fgets(comm_buf, buf_size, fp)) { + len = strlen(comm_buf); + if (len > 0 && comm_buf[len - 1] == '\n') + comm_buf[len - 1] = '\0'; + } else { + fclose(fp); + return -1; + } + fclose(fp); + return 0; +} + +static int fetch_and_fill_task_info(int pid, const char *comm) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } resp; + struct taskstats stats; + struct nlattr *nested; + struct nlattr *na; + int nested_len; + int nl_len; + int rc; + + if (send_cmd(nl_sd, family_id, getpid(), TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_PID, &pid, sizeof(pid)) < 0) { + return -1; + } + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) + return -1; + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == TASKSTATS_TYPE_AGGR_PID) { + nested = (struct nlattr *) NLA_DATA(na); + nested_len = NLA_PAYLOAD(na->nla_len); + while (nested_len > 0) { + if (nested->nla_type == TASKSTATS_TYPE_STATS) { + memcpy(&stats, NLA_DATA(nested), sizeof(stats)); + if (task_count < MAX_TASKS) { + tasks[task_count].pid = pid; + tasks[task_count].tgid = pid; + strncpy(tasks[task_count].command, comm, + TASK_COMM_LEN - 1); + tasks[task_count].command[TASK_COMM_LEN - 1] = '\0'; + SET_TASK_STAT(task_count, cpu_count); + SET_TASK_STAT(task_count, cpu_delay_total); + SET_TASK_STAT(task_count, blkio_count); + SET_TASK_STAT(task_count, blkio_delay_total); + SET_TASK_STAT(task_count, swapin_count); + SET_TASK_STAT(task_count, swapin_delay_total); + SET_TASK_STAT(task_count, freepages_count); + SET_TASK_STAT(task_count, freepages_delay_total); + SET_TASK_STAT(task_count, thrashing_count); + SET_TASK_STAT(task_count, thrashing_delay_total); + SET_TASK_STAT(task_count, compact_count); + SET_TASK_STAT(task_count, compact_delay_total); + SET_TASK_STAT(task_count, wpcopy_count); + SET_TASK_STAT(task_count, wpcopy_delay_total); + SET_TASK_STAT(task_count, irq_count); + SET_TASK_STAT(task_count, irq_delay_total); + task_count++; + } + break; + } + nested_len -= NLA_ALIGN(nested->nla_len); + nested = NLA_NEXT(nested); + } + } + nl_len -= NLA_ALIGN(na->nla_len); + na = NLA_NEXT(na); + } + return 0; +} + +static void get_task_delays(void) +{ + char comm[TASK_COMM_LEN]; + struct dirent *entry; + DIR *dir; + int pid; + + task_count = 0; + if (cfg.monitor_pid > 0) { + if (read_comm(cfg.monitor_pid, comm, sizeof(comm)) == 0) + fetch_and_fill_task_info(cfg.monitor_pid, comm); + return; + } + + dir = opendir("/proc"); + if (!dir) { + fprintf(stderr, "Error opening /proc directory\n"); + return; + } + + while ((entry = readdir(dir)) != NULL && task_count < 
MAX_TASKS) { + if (!isdigit(entry->d_name[0])) + continue; + pid = atoi(entry->d_name); + if (pid == 0) + continue; + if (read_comm(pid, comm, sizeof(comm)) != 0) + continue; + fetch_and_fill_task_info(pid, comm); + } + closedir(dir); +} + +/* Calculate average delay in milliseconds */ +static double average_ms(unsigned long long total, unsigned long long count) +{ + if (count == 0) + return 0; + return (double)total / 1000000.0 / count; +} + +/* Comparison function for sorting tasks */ +static int compare_tasks(const void *a, const void *b) +{ + const struct task_info *t1 = (const struct task_info *)a; + const struct task_info *t2 = (const struct task_info *)b; + double avg1, avg2; + + switch (cfg.sort_field) { + case 'c': /* CPU */ + avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); + avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); + if (avg1 != avg2) + return avg2 > avg1 ? 1 : -1; + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + + default: + return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + } +} + +/* Sort tasks by selected field */ +static void sort_tasks(void) +{ + if (task_count > 0) + qsort(tasks, task_count, sizeof(struct task_info), compare_tasks); +} + +/* Get container statistics via cgroupstats */ +static void get_container_stats(void) +{ + int rc, cfd; + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; + } req, resp; + struct nlattr *na; + int nl_len; + struct cgroupstats stats; + + /* Check if container path is set */ + if (!cfg.container_path) + return; + + /* Open container cgroup */ + cfd = open(cfg.container_path, O_RDONLY); + if (cfd < 0) { + fprintf(stderr, "Error opening container path: %s\n", cfg.container_path); + return; + } + + /* Send request for container stats */ + if (send_cmd(nl_sd, family_id, getpid(), CGROUPSTATS_CMD_GET, + CGROUPSTATS_CMD_ATTR_FD, &cfd, sizeof(__u32)) < 0) { + fprintf(stderr, "Failed to send request for container stats\n"); + close(cfd); + return; + } + + /* Receive response */ + rc = recv(nl_sd, &resp, sizeof(resp), 0); + if (rc < 0 || resp.n.nlmsg_type == NLMSG_ERROR) { + fprintf(stderr, "Failed to receive response for container stats\n"); + close(cfd); + return; + } + + /* Parse response */ + nl_len = GENLMSG_PAYLOAD(&resp.n); + na = (struct nlattr *) GENLMSG_DATA(&resp); + while (nl_len > 0) { + if (na->nla_type == CGROUPSTATS_TYPE_CGROUP_STATS) { + /* Get the cgroupstats structure */ + memcpy(&stats, NLA_DATA(na), sizeof(stats)); + + /* Fill container stats */ + container_stats.nr_sleeping = stats.nr_sleeping; + container_stats.nr_running = stats.nr_running; + container_stats.nr_stopped = stats.nr_stopped; + container_stats.nr_uninterruptible = stats.nr_uninterruptible; + container_stats.nr_io_wait = stats.nr_io_wait; + break; + } + nl_len -= NLA_ALIGN(na->nla_len); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + } + + close(cfd); +} + +/* Display results to stdout or log file */ +static void display_results(void) +{ + time_t now = time(NULL); + struct tm *tm_now = localtime(&now); + char timestamp[32]; + int i, count; + FILE *out = stdout; + + fprintf(out, "\033[H\033[J"); + + if (cfg.container_path) { + fprintf(out, "Container Information (%s):\n", cfg.container_path); + fprintf(out, "Processes: running=%d, sleeping=%d, ", + container_stats.nr_running, container_stats.nr_sleeping); + fprintf(out, "stopped=%d, uninterruptible=%d, io_wait=%d\n\n", + container_stats.nr_stopped, container_stats.nr_uninterruptible, + container_stats.nr_io_wait); + } + 
fprintf(out, "Top %d processes (sorted by CPU delay):\n\n", + cfg.max_processes); + fprintf(out, " PID TGID COMMAND CPU(ms) IO(ms) "); + fprintf(out, "SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms)\n"); + fprintf(out, "-----------------------------------------------"); + fprintf(out, "----------------------------------------------\n"); + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; + + for (i = 0; i < count; i++) { + fprintf(out, "%5d %5d %-15s ", + tasks[i].pid, tasks[i].tgid, tasks[i].command); + fprintf(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", + average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), + average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), + average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), + average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), + average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), + average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), + average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), + average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + } + + fprintf(out, "\n"); +} + +/* Main function */ +int main(int argc, char **argv) +{ + int iterations = 0; + int use_q_quit = 0; + + /* Parse command line arguments */ + parse_args(argc, argv); + + /* Setup netlink socket */ + nl_sd = create_nl_socket(); + if (nl_sd < 0) { + fprintf(stderr, "Error creating netlink socket\n"); + exit(1); + } + + /* Get family ID for taskstats via netlink */ + family_id = get_family_id(nl_sd); + if (!family_id) { + fprintf(stderr, "Error getting taskstats family ID\n"); + close(nl_sd); + exit(1); + } + + if (!cfg.output_one_time) { + use_q_quit = 1; + enable_raw_mode(); + printf("Press 'q' to quit.\n"); + fflush(stdout); + } + + /* Main loop */ + while (running) { + /* Get container stats if container path provided */ + if (cfg.container_path) + get_container_stats(); + + /* Get task delays */ + get_task_delays(); + + /* Sort tasks */ + sort_tasks(); + + /* Display results to stdout or log file */ + display_results(); + + /* Check for iterations */ + if (cfg.iterations > 0 && ++iterations >= cfg.iterations) + break; + + /* Exit if output_one_time is set */ + if (cfg.output_one_time) + break; + + /* Check for 'q' key to quit */ + if (use_q_quit) { + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + char ch = 0; + + read(STDIN_FILENO, &ch, 1); + if (ch == 'q' || ch == 'Q') { + running = 0; + break; + } + } + } else { + sleep(cfg.delay); + } + } + + /* Restore terminal mode */ + if (use_q_quit) + disable_raw_mode(); + + /* Cleanup */ + close(nl_sd); + if (cfg.container_path) + free(cfg.container_path); + + return 0; +} diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 43dec6eed559a..9785c1d49f05e 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -255,7 +255,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON 
PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for the user space selector when dispatch is enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c index c397336fe1ed1..c68713439b33a 100644 --- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c @@ -8,22 +8,31 @@ #include #include #include +#include #include #include #include #include #include "uprobe_syscall.skel.h" #include "uprobe_syscall_executed.skel.h" +#include "bpf/libbpf_internal.h" -__naked unsigned long uretprobe_regs_trigger(void) +#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00 +#include "usdt.h" + +#pragma GCC diagnostic ignored "-Wattributes" + +__attribute__((aligned(16))) +__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void) { asm volatile ( + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */ "movq $0xdeadbeef, %rax\n" "ret\n" ); } -__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) +__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after) { asm volatile ( "movq %r15, 0(%rdi)\n" @@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) "movq $0, 120(%rdi)\n" /* orig_rax */ "movq $0, 128(%rdi)\n" /* rip */ "movq $0, 136(%rdi)\n" /* cs */ + "pushq %rax\n" "pushf\n" "pop %rax\n" "movq %rax, 144(%rdi)\n" /* eflags */ + "pop %rax\n" "movq %rsp, 152(%rdi)\n" /* rsp */ "movq $0, 160(%rdi)\n" /* ss */ /* save 2nd argument */ "pushq %rsi\n" - "call uretprobe_regs_trigger\n" + "call uprobe_regs_trigger\n" /* save return value and load 2nd argument pointer to rax */ "pushq %rax\n" @@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after) ); } -static void test_uretprobe_regs_equal(void) +static void test_uprobe_regs_equal(bool retprobe) { + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = retprobe, + ); struct uprobe_syscall *skel = NULL; struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; unsigned long *pa = (unsigned long *) &after; unsigned long *pp; + unsigned long offset; unsigned int i, cnt; - int err; + + offset = get_uprobe_offset(&uprobe_regs_trigger); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return; skel = uprobe_syscall__open_and_load(); if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load")) goto cleanup; - err = uprobe_syscall__attach(skel); - if (!ASSERT_OK(err, "uprobe_syscall__attach")) + skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts")) goto cleanup; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + if (!retprobe) + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); pp = (unsigned long *) &skel->bss->regs; cnt = sizeof(before)/sizeof(*pb); @@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void) unsigned int offset = i * sizeof(unsigned long); /* - * Check register before and after uretprobe_regs_trigger call + * Check register before and after uprobe_regs_trigger call * that triggers the uretprobe. 
*/ switch (offset) { @@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void) /* * Check register seen from bpf program and register after - * uretprobe_regs_trigger call + * uprobe_regs_trigger call (with rax exception, check below). */ switch (offset) { /* @@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void) case offsetof(struct pt_regs, rsp): case offsetof(struct pt_regs, ss): break; + /* + * uprobe does not see return value in rax, it needs to see the + * original (before) rax value + */ + case offsetof(struct pt_regs, rax): + if (!retprobe) { + ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check"); + break; + } default: if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check")) fprintf(stdout, "failed register offset %u\n", offset); @@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset) return ret != n ? (int) ret : 0; } -static void test_uretprobe_regs_change(void) +static void test_regs_change(void) { struct pt_regs before = {}, after = {}; unsigned long *pb = (unsigned long *) &before; @@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void) unsigned long cnt = sizeof(before)/sizeof(*pb); unsigned int i, err, offset; - offset = get_uprobe_offset(uretprobe_regs_trigger); + offset = get_uprobe_offset(uprobe_regs_trigger); err = write_bpf_testmod_uprobe(offset); if (!ASSERT_OK(err, "register_uprobe")) return; - uretprobe_regs(&before, &after); + /* make sure uprobe gets optimized */ + uprobe_regs_trigger(); + + uprobe_regs(&before, &after); err = write_bpf_testmod_uprobe(0); if (!ASSERT_OK(err, "unregister_uprobe")) @@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void) ); struct uprobe_syscall_executed *skel; int pid, status, err, go[2], c; + struct bpf_link *link; if (!ASSERT_OK(pipe(go), "pipe")) return; @@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void) _exit(0); } - skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid, - "/proc/self/exe", - "uretprobe_syscall_call", &opts); - if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi")) + skel->bss->pid = pid; + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + pid, "/proc/self/exe", + "uretprobe_syscall_call", &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) goto cleanup; + skel->links.test_uretprobe_multi = link; /* kick the child */ write(go[1], &c, 1); @@ -301,6 +340,265 @@ static void test_uretprobe_syscall_call(void) close(go[0]); } +#define TRAMP "[uprobes-trampoline]" + +__attribute__((aligned(16))) +__nocf_check __weak __naked void uprobe_test(void) +{ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} + +__attribute__((aligned(16))) +__nocf_check __weak void usdt_test(void) +{ + USDT(optimized_uprobe, usdt); +} + +static int find_uprobes_trampoline(void *tramp_addr) +{ + void *start, *end; + char line[128]; + int ret = -1; + FILE *maps; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + fprintf(stderr, "cannot open maps\n"); + return -1; + } + + while (fgets(line, sizeof(line), maps)) { + int m = -1; + + /* We care only about private r-x mappings. 
*/ + if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2) + continue; + if (m < 0) + continue; + if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) { + ret = 0; + break; + } + } + + fclose(maps); + return ret; +} + +static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; + +static void *find_nop5(void *fn) +{ + int i; + + for (i = 0; i < 10; i++) { + if (!memcmp(nop5, fn + i, 5)) + return fn + i; + } + return NULL; +} + +typedef void (__attribute__((nocf_check)) *trigger_t)(void); + +static bool shstk_is_enabled; + +static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger, + void *addr, int executed) +{ + struct __arch_relative_insn { + __u8 op; + __s32 raddr; + } __packed *call; + void *tramp = NULL; + __u8 *bp; + + /* Uprobe gets optimized after first trigger, so let's press twice. */ + trigger(); + trigger(); + + /* Make sure bpf program got executed.. */ + ASSERT_EQ(skel->bss->executed, executed, "executed"); + + if (shstk_is_enabled) { + /* .. and check optimization is disabled under shadow stack. */ + bp = (__u8 *) addr; + ASSERT_EQ(*bp, 0xcc, "int3"); + } else { + /* .. and check the trampoline is as expected. */ + call = (struct __arch_relative_insn *) addr; + tramp = (void *) (call + 1) + call->raddr; + ASSERT_EQ(call->op, 0xe8, "call"); + ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline"); + } + + return tramp; +} + +static void check_detach(void *addr, void *tramp) +{ + /* [uprobes_trampoline] stays after detach */ + ASSERT_OK(!shstk_is_enabled && find_uprobes_trampoline(tramp), "uprobes_trampoline"); + ASSERT_OK(memcmp(addr, nop5, 5), "nop5"); +} + +static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link, + trigger_t trigger, void *addr, int executed) +{ + void *tramp; + + tramp = check_attach(skel, trigger, addr, executed); + bpf_link__destroy(link); + check_detach(addr, tramp); +} + +static void test_uprobe_legacy(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_opts, opts, + .retprobe = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + /* uprobe */ + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe */ + skel->bss->executed = 0; + + link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_multi(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + /* uprobe.multi */ + link = 
bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + + /* uretprobe.multi */ + skel->bss->executed = 0; + opts.retprobe = true; + link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_session(void) +{ + struct uprobe_syscall_executed *skel = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts, + .session = true, + ); + struct bpf_link *link; + unsigned long offset; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + goto cleanup; + + opts.offsets = &offset; + opts.cnt = 1; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session, + 0, "/proc/self/exe", NULL, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi")) + goto cleanup; + + check(skel, link, uprobe_test, uprobe_test, 4); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + +static void test_uprobe_usdt(void) +{ + struct uprobe_syscall_executed *skel; + struct bpf_link *link; + void *addr; + + errno = 0; + addr = find_nop5(usdt_test); + if (!ASSERT_OK_PTR(addr, "find_nop5")) + return; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return; + + skel->bss->pid = getpid(); + + link = bpf_program__attach_usdt(skel->progs.test_usdt, + -1 /* all PIDs */, "/proc/self/exe", + "optimized_uprobe", "usdt", NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt")) + goto cleanup; + + check(skel, link, usdt_test, addr, 2); + +cleanup: + uprobe_syscall_executed__destroy(skel); +} + /* * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c. * @@ -343,43 +641,259 @@ static void test_uretprobe_shadow_stack(void) return; } - /* Run all of the uretprobe tests. */ - test_uretprobe_regs_equal(); - test_uretprobe_regs_change(); + /* Run all the tests with shadow stack in place. 
*/ + shstk_is_enabled = true; + + test_uprobe_regs_equal(false); + test_uprobe_regs_equal(true); test_uretprobe_syscall_call(); + test_uprobe_legacy(); + test_uprobe_multi(); + test_uprobe_session(); + test_uprobe_usdt(); + + test_regs_change(); + + shstk_is_enabled = false; + ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); } -#else -static void test_uretprobe_regs_equal(void) + +static volatile bool race_stop; + +static USDT_DEFINE_SEMA(race); + +static void *worker_trigger(void *arg) { - test__skip(); + unsigned long rounds = 0; + + while (!race_stop) { + uprobe_test(); + rounds++; + } + + printf("tid %d trigger rounds: %lu\n", gettid(), rounds); + return NULL; } -static void test_uretprobe_regs_change(void) +static void *worker_attach(void *arg) { - test__skip(); + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct uprobe_syscall_executed *skel; + unsigned long rounds = 0, offset; + const char *sema[2] = { + __stringify(USDT_SEMA(race)), + NULL, + }; + unsigned long *ref; + int err; + + offset = get_uprobe_offset(&uprobe_test); + if (!ASSERT_GE(offset, 0, "get_uprobe_offset")) + return NULL; + + err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT); + if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema")) + return NULL; + + opts.ref_ctr_offset = *ref; + + skel = uprobe_syscall_executed__open_and_load(); + if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load")) + return NULL; + + skel->bss->pid = getpid(); + + while (!race_stop) { + skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe, + 0, "/proc/self/exe", offset, &opts); + if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts")) + break; + + bpf_link__destroy(skel->links.test_uprobe); + skel->links.test_uprobe = NULL; + rounds++; + } + + printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed); + uprobe_syscall_executed__destroy(skel); + free(ref); + return NULL; } -static void test_uretprobe_syscall_call(void) +static useconds_t race_msec(void) { - test__skip(); + char *env; + + env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC"); + if (env) + return atoi(env); + + /* default duration is 500ms */ + return 500; } -static void test_uretprobe_shadow_stack(void) +static void test_uprobe_race(void) { - test__skip(); + int err, i, nr_threads; + pthread_t *threads; + + nr_threads = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus")) + return; + nr_threads = max(2, nr_threads); + + threads = alloca(sizeof(*threads) * nr_threads); + if (!ASSERT_OK_PTR(threads, "malloc")) + return; + + for (i = 0; i < nr_threads; i++) { + err = pthread_create(&threads[i], NULL, i % 2 ? 
worker_trigger : worker_attach, + NULL); + if (!ASSERT_OK(err, "pthread_create")) + goto cleanup; + } + + usleep(race_msec() * 1000); + +cleanup: + race_stop = true; + for (nr_threads = i, i = 0; i < nr_threads; i++) + pthread_join(threads[i], NULL); + + ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore"); } + +#ifndef __NR_uprobe +#define __NR_uprobe 336 #endif -void test_uprobe_syscall(void) +static void test_uprobe_sigill(void) +{ + int status, err, pid; + + pid = fork(); + if (!ASSERT_GE(pid, 0, "fork")) + return; + /* child */ + if (pid == 0) { + asm volatile ( + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uprobe) ", %rax\n" + "syscall\n" + "popq %r11\n" + "popq %rcx\n" + "retq\n" + ); + exit(0); + } + + err = waitpid(pid, &status, 0); + ASSERT_EQ(err, pid, "waitpid"); + + /* verify the child got killed with SIGILL */ + ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED"); + ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG"); +} + +#define UPROBE_TRAMPOLINE_TEST_FILE "/sys/kernel/uprobe_trampoline" + +static int write_trampoline(unsigned long vaddr) +{ + size_t n, ret; + char buf[30]; + int fd; + + fprintf(stderr, "KRAVA vaddr %lx\n", vaddr); + + n = sprintf(buf, "%lu", vaddr); + + fd = open(UPROBE_TRAMPOLINE_TEST_FILE, O_WRONLY); + if (fd < 0) + return -errno; + + ret = write(fd, buf, n); + close(fd); + return ret != n ? (int) ret : 0; +} + +#define TASK_SIZE 0x7ffffffff000 +static void test_trampoline(void) +{ + int i; + + fprintf(stderr, "KRAVA 1\n"); + + write_trampoline(0x2000); + write_trampoline((1UL << 32) + 0x2000); + write_trampoline((1UL << 32)*10 + 0x1000); + write_trampoline((1UL << 32)*10 - 0x1000); + + fprintf(stderr, "KRAVA 2\n"); + + for (i = 0; i < 10; i++) { + write_trampoline((1UL << 32)*10 - i*1*(1UL << 30)); + write_trampoline((1UL << 32)*10 - i*1*(1UL << 30) + 4096); + } + + fprintf(stderr, "KRAVA 3\n"); + + for (i = 0; i < 10; i++) { + write_trampoline((1UL << 32)*10 + i*1*(1UL << 30)); + write_trampoline((1UL << 32)*10 + i*1*(1UL << 30) + 4096); + } + + fprintf(stderr, "KRAVA 4\n"); + + write_trampoline(TASK_SIZE - 0x1000); + write_trampoline(TASK_SIZE - 0x2000); + + fprintf(stderr, "KRAVA 5\n"); + + for (i = 0; i < 10; i++) { + write_trampoline(TASK_SIZE + i*1*(1UL << 30)); + write_trampoline(TASK_SIZE + i*1*(1UL << 30) + 4096); + } +} + +static void __test_uprobe_syscall(void) { if (test__start_subtest("uretprobe_regs_equal")) - test_uretprobe_regs_equal(); - if (test__start_subtest("uretprobe_regs_change")) - test_uretprobe_regs_change(); + test_uprobe_regs_equal(true); if (test__start_subtest("uretprobe_syscall_call")) test_uretprobe_syscall_call(); if (test__start_subtest("uretprobe_shadow_stack")) test_uretprobe_shadow_stack(); + if (test__start_subtest("uprobe_legacy")) + test_uprobe_legacy(); + if (test__start_subtest("uprobe_multi")) + test_uprobe_multi(); + if (test__start_subtest("uprobe_session")) + test_uprobe_session(); + if (test__start_subtest("uprobe_usdt")) + test_uprobe_usdt(); + if (test__start_subtest("uprobe_race")) + test_uprobe_race(); + if (test__start_subtest("uprobe_sigill")) + test_uprobe_sigill(); + if (test__start_subtest("uprobe_regs_equal")) + test_uprobe_regs_equal(false); + if (test__start_subtest("regs_change")) + test_regs_change(); + if (test__start_subtest("trampoline")) + test_trampoline(); +} +#else +static void __test_uprobe_syscall(void) +{ + test__skip(); +} +#endif + +void test_uprobe_syscall(void) +{ + __test_uprobe_syscall(); } diff --git 
a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c index 9057e983cc540..833eb87483a1c 100644 --- a/tools/testing/selftests/bpf/prog_tests/usdt.c +++ b/tools/testing/selftests/bpf/prog_tests/usdt.c @@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) { } } -static void subtest_basic_usdt(void) +static void subtest_basic_usdt(bool optimized) { LIBBPF_OPTS(bpf_usdt_opts, opts); struct test_usdt *skel; struct test_usdt__bss *bss; - int err, i; + int err, i, called; + +#define TRIGGER(x) ({ \ + trigger_func(x); \ + if (optimized) \ + trigger_func(x); \ + optimized ? 2 : 1; \ + }) skel = test_usdt__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) @@ -66,11 +73,11 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link")) goto cleanup; - trigger_func(1); + called = TRIGGER(1); - ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie"); ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt"); @@ -119,11 +126,11 @@ static void subtest_basic_usdt(void) * bpf_program__attach_usdt() handles this properly and attaches to * all possible places of USDT invocation. */ - trigger_func(2); + called += TRIGGER(2); - ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called"); - ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called"); - ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called"); + ASSERT_EQ(bss->usdt0_called, called, "usdt0_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); + ASSERT_EQ(bss->usdt12_called, called, "usdt12_called"); /* only check values that depend on trigger_func()'s input value */ ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1"); @@ -142,9 +149,9 @@ static void subtest_basic_usdt(void) if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach")) goto cleanup; - trigger_func(3); + called += TRIGGER(3); - ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called"); + ASSERT_EQ(bss->usdt3_called, called, "usdt3_called"); /* this time usdt3 has custom cookie */ ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie"); ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt"); @@ -158,6 +165,7 @@ static void subtest_basic_usdt(void) cleanup: test_usdt__destroy(skel); +#undef TRIGGER } unsigned short test_usdt_100_semaphore SEC(".probes"); @@ -425,7 +433,11 @@ static void subtest_urandom_usdt(bool auto_attach) void test_usdt(void) { if (test__start_subtest("basic")) - subtest_basic_usdt(); + subtest_basic_usdt(false); +#ifdef __x86_64__ + if (test__start_subtest("basic_optimized")) + subtest_basic_usdt(true); +#endif if (test__start_subtest("multispec")) subtest_multispec_usdt(); if (test__start_subtest("urand_auto_attach")) diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall.c b/tools/testing/selftests/bpf/progs/uprobe_syscall.c index 8a4fa6c7ef590..e08c31669e5a7 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall.c @@ -7,8 +7,8 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; -SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger") -int uretprobe(struct pt_regs *ctx) +SEC("uprobe") +int probe(struct pt_regs *ctx) { __builtin_memcpy(®s, ctx, sizeof(regs)); return 0; diff --git 
a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c index 0d7f1a7db2e2e..915d38591bf6a 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c +++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" #include +#include +#include #include struct pt_regs regs; @@ -8,10 +10,64 @@ struct pt_regs regs; char _license[] SEC("license") = "GPL"; int executed = 0; +int pid; + +SEC("uprobe") +int BPF_UPROBE(test_uprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(test_uretprobe) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.multi") +int test_uprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} SEC("uretprobe.multi") -int test(struct pt_regs *regs) +int test_uretprobe_multi(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("uprobe.session") +int test_uprobe_session(struct pt_regs *ctx) { - executed = 1; + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; + return 0; +} + +SEC("usdt") +int test_usdt(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + executed++; return 0; } diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e9e918cdf31ff..511911053bdc5 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -500,15 +500,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { */ #ifdef __x86_64__ +static int +uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data) +{ + regs->cx = 0x87654321feebdaed; + return 0; +} + static int uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs, __u64 *data) { regs->ax = 0x12345678deadbeef; - regs->cx = 0x87654321feebdaed; regs->r11 = (u64) -1; - return true; + return 0; } struct testmod_uprobe { @@ -520,6 +526,7 @@ struct testmod_uprobe { static DEFINE_MUTEX(testmod_uprobe_mutex); static struct testmod_uprobe uprobe = { + .consumer.handler = uprobe_handler, .consumer.ret_handler = uprobe_ret_handler, }; diff --git a/tools/testing/selftests/bpf/usdt.h b/tools/testing/selftests/bpf/usdt.h new file mode 100644 index 0000000000000..549d1f7748101 --- /dev/null +++ b/tools/testing/selftests/bpf/usdt.h @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * This single-header library defines a collection of variadic macros for + * defining and triggering USDTs (User Statically-Defined Tracepoints): + * + * - For USDTs without associated semaphore: + * USDT(group, name, args...) + * + * - For USDTs with implicit (transparent to the user) semaphore: + * USDT_WITH_SEMA(group, name, args...) + * USDT_IS_ACTIVE(group, name) + * + * - For USDTs with explicit (user-defined and provided) semaphore: + * USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...) + * USDT_SEMA_IS_ACTIVE(sema) + * + * all of which emit a NOP instruction into the instruction stream, and so + * have *zero* overhead for the surrounding code. 
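+ * For example, a minimal illustrative use of the semaphoreless form (the
+ * myapp:request__start identifier and its arguments are made up for this
+ * sketch):
+ *
+ *     #include "usdt.h"
+ *
+ *     void handle_request(int req_id, long req_size)
+ *     {
+ *             USDT(myapp, request__start, req_id, req_size);
+ *     }
+ *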
USDTs are identified by + * a combination of `group` and `name` identifiers, which is used by external + * tracing tooling (tracers) for identifying exact USDTs of interest. + * + * USDTs can have an associated (2-byte) activity counter (USDT semaphore), + * automatically maintained by the Linux kernel whenever any correctly written + * BPF-based tracer is attached to the USDT. This USDT semaphore can be used + * to check whether there is a need to do any extra data collection and + * processing for a given USDT (if necessary), and otherwise avoid extra work + * for the common case of the USDT not being traced (not "active"). + * + * See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or + * USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on + * working with USDTs with implicitly or explicitly associated + * USDT semaphores, respectively. + * + * There is also some additional data recorded into an auxiliary note + * section. The data in the note section describes the operands, in terms of + * size and location, used by tracing tooling to know where to find USDT + * arguments. Each location is encoded as an assembler operand string. + * Tracing tools (bpftrace and BPF-based tracers, systemtap, etc.) insert + * breakpoints on top of the nop, and decode the location operand-strings, + * like an assembler, to find the values being passed. + * + * The operand strings are selected by the compiler for each operand. + * They are constrained by inline-assembler codes. The default is: + * + * #define USDT_ARG_CONSTRAINT nor + * + * This is a good default if the operands tend to be integral and + * moderate in number (fewer than the number of registers). In other + * cases, the compiler may report "'asm' requires impossible reload" or + * similar. In this case, consider simplifying the macro call (fewer + * and simpler operands), reducing optimization, or overriding the default + * constraints string via: + * + * #define USDT_ARG_CONSTRAINT g + * #include "usdt.h" + * + * For some historical description of the USDT v3 format (the one used by this + * library and generally recognized and assumed by BPF-based tracing tools) + * see [0]. The more formal specification can be found at [1]. Additional + * argument constraints information can be found at [2]. + * + * The original SystemTap sys/sdt.h implementation ([3]) was used as a base for + * this USDT library implementation. The current implementation differs *a lot* in + * terms of exposed user API and general usability, which was the main goal + * and focus of the reimplementation work. Nevertheless, the underlying recorded + * USDT definitions are fully binary compatible and any USDT-based tooling + * should work equally well with USDTs defined by either SystemTap's or this + * library's USDT implementation. + * + * [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html + * [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + * [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + * [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h + */ +#ifndef __USDT_H +#define __USDT_H + +/* + * Changelog: + * + * 0.1.0 + * ----- + * - Initial release + */ +#define USDT_MAJOR_VERSION 0 +#define USDT_MINOR_VERSION 1 +#define USDT_PATCH_VERSION 0 + +/* C++20 and C23 added __VA_OPT__ as a standard replacement for the non-standard `##__VA_ARGS__` extension */ +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L) +#define __usdt_va_opt 1 +#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
+#else +#define __usdt_va_args(...) , ##__VA_ARGS__ +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. No USDT semaphore is + * associated with this USDT. + * + * Such "semaphoreless" USDTs are commonly used when there is no extra data + * collection or processing needed to collect and prepare USDT arguments and + * they are just available in the surrounding code. The USDT() macro will just + * record their locations in CPU registers or in memory for tracing tooling to + * be able to access them, if necessary. + */ +#ifdef __usdt_va_opt +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT(group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__) +#endif + +/* + * Trigger USDT with `group`:`name` identifier and pass through `args` as its + * arguments. Zero arguments are acceptable as well. The USDT also gets an + * implicitly-defined associated USDT semaphore, which will be "activated" by + * tracing tooling and can be used to check whether the USDT is being actively + * observed. + * + * USDTs with a semaphore are commonly used when there is a need to perform + * additional data collection and processing to prepare USDT arguments, which + * otherwise might not be necessary for the rest of the application logic. In + * such a case, the USDT semaphore can be used to avoid unnecessary extra work. + * If the USDT is not traced (which is presumed to be a common situation), the + * associated USDT semaphore is "inactive", and so there is no need to waste + * resources to prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to + * check whether the USDT is "active". + * + * N.B. There is an inherent (albeit short) gap between checking whether the + * USDT is active and triggering the corresponding USDT, in which an external + * tracer can attach to the USDT and activate the USDT semaphore after the + * activity check. If such a race occurs, tracers might miss one USDT + * execution. Tracers are expected to accommodate such a possibility; this is + * not expected to be a problem for applications and tracers. + * + * N.B. An implicit USDT semaphore defined by USDT_WITH_SEMA() is contained + * within a single executable or shared library and is not shared outside + * them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name + * identifier across an executable and a shared library, it will work and + * won't conflict, per se, but will define independent USDT semaphores, one + * for each shared library/executable in which USDT_WITH_SEMA(group, name) is + * used. That is, if you attach to this USDT in one shared library (or + * executable), then only the USDT semaphore within that shared library (or + * executable) will be updated by the kernel, while other libraries (or the + * executable) will not see an activated USDT semaphore. In short, it's best + * to use unique USDT group:name identifiers across different shared libraries + * (and, equivalently, between the executable and shared libraries). This is + * an advanced consideration and is rarely (if ever) seen in practice, but + * just to avoid surprises this is called out here. (Static libraries become + * a part of the final executable, once linked by the linker, so the above + * considerations don't apply to them.) + */ +#ifdef __usdt_va_opt
+#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name) \ + __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_SEMA(group, name, ...) \ + __usdt_probe(group, name, \ + __usdt_sema_implicit, __usdt_sema_name(group, name), \ + ##__VA_ARGS__) +#endif + +struct usdt_sema { volatile unsigned short active; }; + +/* + * Check if the USDT with `group`:`name` identifier is "active" (i.e., whether + * it is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into the given USDT. It is assumed that the USDT is triggered + * with the USDT_WITH_SEMA() macro, which will implicitly define the + * associated USDT semaphore. (If one needs more control over the USDT + * semaphore, see USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros + * below.) + * + * N.B. Such checks are necessarily racy and speculative. Between checking + * whether the USDT is active and triggering the USDT itself, the tracer can + * be detached with no notification. This race should be extremely rare and + * the worst case should result in one-time wasted extra data collection and + * processing. + */ +#define USDT_IS_ACTIVE(group, name) ({ \ + extern struct usdt_sema __usdt_sema_name(group, name) \ + __usdt_asm_name(__usdt_sema_name(group, name)); \ + __usdt_sema_implicit(__usdt_sema_name(group, name)); \ + __usdt_sema_name(group, name).active > 0; \ +}) + +/* + * APIs for working with user-defined explicit USDT semaphores. + * + * This is a less commonly used advanced API for use cases in which the user + * needs explicit control over a (potentially shared across multiple USDTs) + * USDT semaphore instance. This can be used when there is a group of + * logically related USDTs that all need extra data collection and processing + * whenever any of a family of related USDTs are "activated" (i.e., traced). + * In such a case, all such related USDTs will be associated with the same + * shared USDT semaphore defined with USDT_DEFINE_SEMA() and the USDTs + * themselves will be triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking + * an explicit extra USDT semaphore identifier as an extra parameter. + */ + +/** + * Underlying C global variable name for user-defined USDT semaphore with + * `sema` identifier. Could be useful for debugging, but normally shouldn't be + * used explicitly. + */ +#define USDT_SEMA(sema) __usdt_sema_##sema + +/* + * Define storage for user-defined USDT semaphore `sema`. + * + * Should be used only once in a non-header source file to let the compiler + * allocate space for the semaphore variable. Just like with any other global + * variable. + * + * This macro can be used anywhere a global variable declaration is + * allowed. Just like with global variable definitions, there should be only + * one definition of a user-defined USDT semaphore with a given `sema` + * identifier, otherwise the compiler or linker will complain about a + * duplicate variable definition. + * + * For C++, it is allowed to use USDT_DEFINE_SEMA() both in the global + * namespace and inside namespaces (including nested namespaces). Just make + * sure that USDT_DECLARE_SEMA() is placed within the namespace where this + * semaphore is referenced, or any of its parent namespaces, so the C++ + * language-level identifier is visible to the code that needs to reference + * the semaphore.
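+ * For example, an illustrative (hypothetical) pair of source files sharing
+ * one explicit semaphore via the macros documented below (USDT_DECLARE_SEMA(),
+ * USDT_SEMA_IS_ACTIVE() and USDT_WITH_EXPLICIT_SEMA()):
+ *
+ *     // stats.c
+ *     USDT_DEFINE_SEMA(stats_sema);
+ *     void flush_stats(long nreq)
+ *     {
+ *             if (USDT_SEMA_IS_ACTIVE(stats_sema))
+ *                     USDT_WITH_EXPLICIT_SEMA(stats_sema, myapp, stats_flush, nreq);
+ *     }
+ *
+ *     // worker.c, references the same semaphore
+ *     USDT_DECLARE_SEMA(stats_sema);
+ *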
+ * At the lowest layer, USDT semaphores have global naming and visibility + * (they have a corresponding `__usdt_sema_<sema>` symbol, which can be linked + * against from C or C++ code, if necessary). To keep it simple, putting + * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest + * no-brainer solution. All these aspects are irrelevant for plain C, because + * C doesn't have namespaces and everything is always in the global namespace. + * + * N.B. Due to USDT metadata being recorded in a non-allocatable ELF note + * section, it has limitations when it comes to relocations, which, in + * practice, means that it's not possible to correctly share USDT semaphores + * between the main executable and shared libraries, or even between multiple + * shared libraries. A USDT semaphore has to be confined to an individual + * shared library or executable to avoid unpleasant surprises with + * half-working USDT semaphores. We enforce this by marking semaphore ELF + * symbols as having hidden visibility. This is quite an advanced + * consideration; for most users this should have no consequences whatsoever. + */ +#define USDT_DEFINE_SEMA(sema) \ + struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \ + __usdt_asm_name(USDT_SEMA(sema)) \ + __attribute__((visibility("hidden"))) = { 0 } + +/* + * Declare an extern reference to user-defined USDT semaphore `sema`. + * + * Refers to a variable defined in another compilation unit by + * USDT_DEFINE_SEMA() and allows using the same USDT semaphore across + * multiple compilation units (i.e., .c and .cpp files). + * + * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities. + */ +#define USDT_DECLARE_SEMA(sema) \ + extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema)) + +/* + * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it + * is attached to by external tracing tooling and is actively observed). + * + * This macro can be used to decide whether any additional and potentially + * expensive data collection or processing should be done to pass extra + * information into USDT(s) associated with USDT semaphore `sema`. + * + * N.B. Such checks are necessarily racy. Between checking the state of the + * USDT semaphore and triggering associated USDT(s), the active tracer might + * attach or detach. This race should be extremely rare and the worst case + * should result in a one-time missed USDT event or wasted extra data + * collection and processing. USDT-using tracers should be written with this + * in mind; it is not a concern of the application defining USDTs with an + * associated semaphore. + */ +#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0) + +/* + * Invoke the USDT specified by `group` and `name` identifiers and associate + * the explicitly user-defined semaphore `sema` with it. Pass through `args` + * as USDT arguments. `args` are optional and zero arguments are acceptable. + * + * The semaphore is defined with the help of the USDT_DEFINE_SEMA() macro and + * its activity can be checked with USDT_SEMA_IS_ACTIVE(). + */ +#ifdef __usdt_va_opt +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__) +#else +#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \ + __usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__) +#endif + +/* + * Adjustable implementation aspects + */ +#ifndef USDT_ARG_CONSTRAINT +#if defined __powerpc__ +#define USDT_ARG_CONSTRAINT nZr +#elif defined __arm__ +#define USDT_ARG_CONSTRAINT g +#elif defined __loongarch__ +#define USDT_ARG_CONSTRAINT nmr +#else +#define USDT_ARG_CONSTRAINT nor +#endif +#endif /* USDT_ARG_CONSTRAINT */ + +#ifndef USDT_NOP +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define USDT_NOP nop 0 +#else +#define USDT_NOP nop +#endif +#endif /* USDT_NOP */ + +/* + * Implementation details + */ +/* USDT name for implicitly-defined USDT semaphore, derived from group:name */ +#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name +/* ELF section into which USDT semaphores are put */ +#define __usdt_sema_sec __attribute__((section(".probes"))) + +#define __usdt_concat(a, b) a ## b +#define __usdt_apply(fn, n) __usdt_concat(fn, n) + +#ifndef __usdt_nth +#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N +#endif + +#ifndef __usdt_narg +#ifdef __usdt_va_opt +#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#else +#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +#endif /* __usdt_narg */ + +#define __usdt_hash # +#define __usdt_str_(x) #x +#define __usdt_str(x) __usdt_str_(x) + +#ifndef __usdt_asm_name +#define __usdt_asm_name(name) __asm__(__usdt_str(name)) +#endif + +#define __usdt_asm0() "\n" +#define __usdt_asm1(x) __usdt_str(x) "\n" +#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__) +#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__) +#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__) +#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__) +#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__) +#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__) +#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__) +#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__) +#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__) +#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__) +#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__) +#define __usdt_asm(...) 
__usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#ifdef __LP64__ +#define __usdt_asm_addr .8byte +#else +#define __usdt_asm_addr .4byte +#endif + +#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x) +#define __usdt_asm_strz(x) __usdt_asm_strz_(x) +#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x) +#define __usdt_asm_str(x) __usdt_asm_str_(x) + +/* "semaphoreless" USDT case */ +#ifndef __usdt_sema_none +#define __usdt_sema_none(sema) +#endif + +/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */ +#ifndef __usdt_sema_implicit +#define __usdt_sema_implicit(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm1(.ifndef sema) \ + __usdt_asm3( .pushsection .probes, "aw", "progbits") \ + __usdt_asm1( .weak sema) \ + __usdt_asm1( .hidden sema) \ + __usdt_asm1( .align 2) \ + __usdt_asm1(sema:) \ + __usdt_asm1( .zero 2) \ + __usdt_asm2( .type sema, @object) \ + __usdt_asm2( .size sema, 2) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + ); +#endif + +/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */ +#ifndef __usdt_sema_explicit +#define __usdt_sema_explicit(sema) \ + __asm__ __volatile__ ("" :: "m" (sema)); +#endif + +/* main USDT definition (nop and .note.stapsdt metadata) */ +#define __usdt_probe(group, name, sema_def, sema, ...) do { \ + sema_def(sema) \ + __asm__ __volatile__ ( \ + __usdt_asm( 990: USDT_NOP) \ + __usdt_asm3( .pushsection .note.stapsdt, "", "note") \ + __usdt_asm1( .balign 4) \ + __usdt_asm3( .4byte 992f-991f,994f-993f,3) \ + __usdt_asm1(991: .asciz "stapsdt") \ + __usdt_asm1(992: .balign 4) \ + __usdt_asm1(993: __usdt_asm_addr 990b) \ + __usdt_asm1( __usdt_asm_addr _.stapsdt.base) \ + __usdt_asm1( __usdt_asm_addr sema) \ + __usdt_asm_strz(group) \ + __usdt_asm_strz(name) \ + __usdt_asm_args(__VA_ARGS__) \ + __usdt_asm1( .ascii "\0") \ + __usdt_asm1(994: .balign 4) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.ifndef _.stapsdt.base) \ + __usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\ + __usdt_asm1( .weak _.stapsdt.base) \ + __usdt_asm1( .hidden _.stapsdt.base) \ + __usdt_asm1(_.stapsdt.base:) \ + __usdt_asm1( .space 1) \ + __usdt_asm2( .size _.stapsdt.base, 1) \ + __usdt_asm1( .popsection) \ + __usdt_asm1(.endif) \ + :: __usdt_asm_ops(__VA_ARGS__) \ + ); \ +} while (0) + +/* + * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + * operand note format. + * + * The named register may be a longer or shorter (!) alias for the + * storage where the value in question is found. For example, on + * i386, 64-bit value may be put in register pairs, and a register + * name stored would identify just one of them. Previously, gcc was + * asked to emit the %w[id] (16-bit alias of some registers holding + * operands), even when a wider 32-bit value was used. + * + * Bottom line: the byte-width given before the @ sign governs. If + * there is a mismatch between that width and that of the named + * register, then a sys/sdt.h note consumer may need to employ + * architecture-specific heuristics to figure out where the compiler + * has actually put the complete value. 
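+ * For example (illustrative): a note operand like "-4@%ax" still describes
+ * a 4-byte signed value; since %ax is only a 16-bit alias, the consumer has
+ * to map the reference to the full %eax to recover the complete value.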
+ */ +#if defined(__powerpc__) || defined(__powerpc64__) +#define __usdt_argref(id) %I[id]%[id] +#elif defined(__i386__) +#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +#define __usdt_argref(id) %[id] +#endif + +#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \ + __usdt_asm1(.ascii "@") \ + __usdt_asm_str(__usdt_argref(__usdt_aval##n)) + +#define __usdt_asm_args0 /* no arguments */ +#define __usdt_asm_args1 __usdt_asm_arg(1) +#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2) +#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3) +#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4) +#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5) +#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6) +#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7) +#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8) +#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9) +#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10) +#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11) +#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12) +#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__)) + +#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5) +#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x)) + +/* + * We can't use __builtin_choose_expr() in C++, so fall back to table-based + * signedness determination for known types, utilizing template magic. + */ +#ifdef __cplusplus + +#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed) + +#include <cstddef> + +template<typename T> struct __usdt_t { static const bool is_signed = false; }; +template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {}; +template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {}; + +#define __usdt_def_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; } +#define __usdt_maybe_signed(T) \ +template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \ +template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; } + +__usdt_def_signed(signed char); +__usdt_def_signed(short); +__usdt_def_signed(int); +__usdt_def_signed(long); +__usdt_def_signed(long long); +__usdt_maybe_signed(char); +__usdt_maybe_signed(wchar_t); + +#else /* !__cplusplus */ + +#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4) +#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U)) +#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1) + +#endif /* __cplusplus */ + +#define __usdt_asm_op(n, x) \
+ [__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \ + [__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x) + +#define __usdt_asm_ops0() [__usdt_dummy] "g" (0) +#define __usdt_asm_ops1(x) __usdt_asm_op(1, x) +#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x) +#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x) +#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x) +#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x) +#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x) +#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x) +#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x) +#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x) +#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x) +#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x) +#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x) +#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__) + +#endif /* __USDT_H */ diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index b380e102b22f0..169dbd692bf5f 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -77,8 +77,11 @@ static void validate_addr(char *ptr, int high_addr) { unsigned long addr = (unsigned long) ptr; - if (high_addr && addr < HIGH_ADDR_MARK) - ksft_exit_fail_msg("Bad address %lx\n", addr); + if (high_addr) { + if (addr < HIGH_ADDR_MARK) + ksft_exit_fail_msg("Bad address %lx\n", addr); + return; + } if (addr > HIGH_ADDR_MARK) ksft_exit_fail_msg("Bad address %lx\n", addr); diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore index b7dde152e75a9..f6be8efd57eaa 100644 --- a/tools/testing/selftests/ptrace/.gitignore +++ b/tools/testing/selftests/ptrace/.gitignore @@ -3,3 +3,4 @@ get_syscall_info get_set_sud peeksiginfo vmaccess +set_syscall_info diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61acbd45ffaaf..2cf6fc825d865 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -73,6 +73,14 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __nocf_check +#define __nocf_check __attribute__((nocf_check)) +#endif + +#ifndef __naked +#define __naked __attribute__((__naked__)) +#endif + #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 @@ -4896,7 +4904,36 @@ TEST(tsync_vs_dead_thread_leader) EXPECT_EQ(0, status); } -noinline int probed(void) +#ifdef __x86_64__ + +/* + * We need a naked probed_uprobe function. Use __nocf_check to skip the + * possible endbr64 instruction and ignore -Wattributes, otherwise the + * compilation might fail. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +__naked __nocf_check noinline int probed_uprobe(void) +{ + /* + * Optimized uprobe is possible only on top of a nop5 instruction.
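+ * The 0x0f, 0x1f, 0x44, 0x00, 0x00 sequence below encodes the canonical
+ * x86_64 5-byte nop, nopl 0x0(%rax,%rax,1), which is the instruction the
+ * optimization looks for.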
+ */ + asm volatile (" \n" + ".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n" + "ret \n" + ); +} +#pragma GCC diagnostic pop + +#else +noinline int probed_uprobe(void) +{ + return 1; +} +#endif + +noinline int probed_uretprobe(void) { return 1; } @@ -4949,35 +4986,46 @@ static ssize_t get_uprobe_offset(const void *addr) return found ? (uintptr_t)addr - start + base : -1; } -FIXTURE(URETPROBE) { +FIXTURE(UPROBE) { int fd; }; -FIXTURE_VARIANT(URETPROBE) { +FIXTURE_VARIANT(UPROBE) { /* - * All of the URETPROBE behaviors can be tested with either - * uretprobe attached or not + * All of the U(RET)PROBE behaviors can be tested with either + * u(ret)probe attached or not */ bool attach; + /* + * Test both uprobe and uretprobe. + */ + bool uretprobe; }; -FIXTURE_VARIANT_ADD(URETPROBE, attached) { +FIXTURE_VARIANT_ADD(UPROBE, not_attached) { + .attach = false, + .uretprobe = false, +}; + +FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) { .attach = true, + .uretprobe = false, }; -FIXTURE_VARIANT_ADD(URETPROBE, not_attached) { - .attach = false, +FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) { + .attach = true, + .uretprobe = true, }; -FIXTURE_SETUP(URETPROBE) +FIXTURE_SETUP(UPROBE) { const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; ssize_t offset; int type, bit; -#ifndef __NR_uretprobe - SKIP(return, "__NR_uretprobe syscall not defined"); +#if !defined(__NR_uprobe) || !defined(__NR_uretprobe) + SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined"); #endif if (!variant->attach) @@ -4987,12 +5035,17 @@ FIXTURE_SETUP(URETPROBE) type = determine_uprobe_perf_type(); ASSERT_GE(type, 0); - bit = determine_uprobe_retprobe_bit(); - ASSERT_GE(bit, 0); - offset = get_uprobe_offset(probed); + + if (variant->uretprobe) { + bit = determine_uprobe_retprobe_bit(); + ASSERT_GE(bit, 0); + } + + offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe); ASSERT_GE(offset, 0); - attr.config |= 1 << bit; + if (variant->uretprobe) + attr.config |= 1 << bit; attr.size = attr_sz; attr.type = type; attr.config1 = ptr_to_u64("/proc/self/exe"); @@ -5003,7 +5056,7 @@ FIXTURE_SETUP(URETPROBE) PERF_FLAG_FD_CLOEXEC); } -FIXTURE_TEARDOWN(URETPROBE) +FIXTURE_TEARDOWN(UPROBE) { /* we could call close(self->fd), but we'd need extra filter for * that and since we are calling _exit right away.. @@ -5017,11 +5070,17 @@ static int run_probed_with_filter(struct sock_fprog *prog) return -1; } - probed(); + /* + * The uprobe is optimized after the first hit, so let's hit twice.
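+ * (The first hit goes through the breakpoint path; the kernel then patches
+ * the nop5 into a call to its user-space trampoline, so the second hit
+ * exercises the optimized path.)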
+ */ + probed_uprobe(); + probed_uprobe(); + + probed_uretprobe(); return 0; } -TEST_F(URETPROBE, uretprobe_default_allow) +TEST_F(UPROBE, uprobe_default_allow) { struct sock_filter filter[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), @@ -5034,7 +5093,7 @@ TEST_F(URETPROBE, uretprobe_default_allow) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block) +TEST_F(UPROBE, uprobe_default_block) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -5051,11 +5110,14 @@ TEST_F(URETPROBE, uretprobe_default_block) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) +TEST_F(UPROBE, uprobe_block_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1), #endif @@ -5070,11 +5132,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall) ASSERT_EQ(0, run_probed_with_filter(&prog)); } -TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall) +TEST_F(UPROBE, uprobe_default_block_with_syscall) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), +#ifdef __NR_uprobe + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0), +#endif #ifdef __NR_uretprobe BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0), #endif diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c index d975a67673299..2eb2c06303f2a 100644 --- a/tools/testing/selftests/syscall_user_dispatch/sud_test.c +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include "../kselftest_harness.h" @@ -17,11 +19,15 @@ #ifndef PR_SET_SYSCALL_USER_DISPATCH # define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 #endif +#ifndef PR_SYS_DISPATCH_EXCLUSIVE_ON +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +#endif + #ifndef SYS_USER_DISPATCH # define SYS_USER_DISPATCH 2 #endif @@ -65,7 +71,7 @@ TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) ret = sysinfo(&info); ASSERT_EQ(0, ret); - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &sel); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); } @@ -79,6 +85,21 @@ TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) } } +static void prctl_valid(struct __test_metadata *_metadata, + unsigned long op, unsigned long off, + unsigned long size, void *sel) +{ + EXPECT_EQ(0, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, sel)); +} + +static void prctl_invalid(struct __test_metadata *_metadata, + unsigned long op, unsigned long off, + unsigned long size, void *sel, int err) +{ + EXPECT_EQ(-1, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, sel)); + EXPECT_EQ(err, errno); +} + TEST(bad_prctl_param) { char sel = SYSCALL_DISPATCH_FILTER_ALLOW; @@ -86,57 +107,54 @@ TEST(bad_prctl_param) /* Invalid op */ op = -1; - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); - ASSERT_EQ(EINVAL, errno); + prctl_invalid(_metadata, op, 0, 0, &sel, EINVAL); /* PR_SYS_DISPATCH_OFF */ op = PR_SYS_DISPATCH_OFF; /* offset != 0 */ - 
prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 0); - EXPECT_EQ(EINVAL, errno); + prctl_invalid(_metadata, op, 0x1, 0x0, 0, EINVAL); /* len != 0 */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); - EXPECT_EQ(EINVAL, errno); + prctl_invalid(_metadata, op, 0x0, 0xff, 0, EINVAL); /* sel != NULL */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); - EXPECT_EQ(EINVAL, errno); + prctl_invalid(_metadata, op, 0x0, 0x0, &sel, EINVAL); /* Valid parameter */ - errno = 0; - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); - EXPECT_EQ(0, errno); + prctl_valid(_metadata, op, 0x0, 0x0, 0x0); - /* PR_SYS_DISPATCH_ON */ - op = PR_SYS_DISPATCH_ON; + /* PR_SYS_DISPATCH_EXCLUSIVE_ON */ + op = PR_SYS_DISPATCH_EXCLUSIVE_ON; /* Dispatcher region is bad (offset > 0 && len == 0) */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); - EXPECT_EQ(EINVAL, errno); - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); - EXPECT_EQ(EINVAL, errno); + prctl_invalid(_metadata, op, 0x1, 0x0, &sel, EINVAL); + prctl_invalid(_metadata, op, -1L, 0x0, &sel, EINVAL); /* Invalid selector */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); - ASSERT_EQ(EFAULT, errno); + prctl_invalid(_metadata, op, 0x0, 0x1, (void *) -1, EFAULT); /* * Dispatcher range overflows unsigned long */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); - ASSERT_EQ(EINVAL, errno) { - TH_LOG("Should reject bad syscall range"); - } + prctl_invalid(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, 1, -1L, &sel, EINVAL); /* * Allowed range overflows unsigned long */ - prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); - ASSERT_EQ(EINVAL, errno) { - TH_LOG("Should reject bad syscall range"); - } + prctl_invalid(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, -1L, 0x1, &sel, EINVAL); + + /* 0 len should fail for PR_SYS_DISPATCH_INCLUSIVE_ON */ + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 1, 0, 0, EINVAL); + + /* Range wrap-around should fail */ + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, -1L, 2, 0, EINVAL); + + /* Normal range shouldn't fail */ + prctl_valid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 2, 3, 0); + + /* Invalid selector */ + prctl_invalid(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, 2, 3, (void *) -1, EFAULT); } /* @@ -147,11 +165,13 @@ char glob_sel; int nr_syscalls_emulated; int si_code; int si_errno; +unsigned long syscall_addr; static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) { si_code = info->si_code; si_errno = info->si_errno; + syscall_addr = (unsigned long)info->si_call_addr; if (info->si_syscall == MAGIC_SYSCALL_1) nr_syscalls_emulated++; @@ -174,31 +194,34 @@ static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) #endif } -TEST(dispatch_and_return) +int setup_sigsys_handler(void) { - long ret; struct sigaction act; sigset_t mask; - glob_sel = 0; - nr_syscalls_emulated = 0; - si_code = 0; - si_errno = 0; - memset(&act, 0, sizeof(act)); sigemptyset(&mask); - act.sa_sigaction = handle_sigsys; act.sa_flags = SA_SIGINFO; act.sa_mask = mask; + return sigaction(SIGSYS, &act, NULL); +} - ret = sigaction(SIGSYS, &act, NULL); - ASSERT_EQ(0, ret); +TEST(dispatch_and_return) +{ + long ret; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + ASSERT_EQ(0, setup_sigsys_handler()); /* Make sure selector is good prior to prctl. 
*/ SYSCALL_DISPATCH_OFF(glob_sel); - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &glob_sel); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); } @@ -254,7 +277,7 @@ TEST_SIGNAL(bad_selector, SIGSYS) /* Make sure selector is good prior to prctl. */ SYSCALL_DISPATCH_OFF(glob_sel); - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &glob_sel); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); } @@ -278,7 +301,7 @@ TEST(disable_dispatch) struct sysinfo info; char sel = 0; - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, &sel); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); } @@ -310,7 +333,7 @@ TEST(direct_dispatch_range) * Instead of calculating libc addresses; allow the entire * memory map and lock the selector. */ - ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, -1L, &sel); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); } @@ -323,4 +346,35 @@ TEST(direct_dispatch_range) } } +static void test_range(struct __test_metadata *_metadata, + unsigned long op, unsigned long off, + unsigned long size, bool dispatch) +{ + nr_syscalls_emulated = 0; + SYSCALL_DISPATCH_OFF(glob_sel); + EXPECT_EQ(0, prctl(PR_SET_SYSCALL_USER_DISPATCH, op, off, size, &glob_sel)); + SYSCALL_DISPATCH_ON(glob_sel); + if (dispatch) { + EXPECT_EQ(syscall(MAGIC_SYSCALL_1), MAGIC_SYSCALL_1); + EXPECT_EQ(nr_syscalls_emulated, 1); + } else { + EXPECT_EQ(syscall(MAGIC_SYSCALL_1), -1); + EXPECT_EQ(nr_syscalls_emulated, 0); + } +} + +TEST(dispatch_range) +{ + ASSERT_EQ(0, setup_sigsys_handler()); + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, 0, 0, true); + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr, 1, false); + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr-100, 200, false); + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr+1, 100, true); + test_range(_metadata, PR_SYS_DISPATCH_EXCLUSIVE_ON, syscall_addr-100, 100, true); + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr, 1, true); + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr-1, 1, false); + test_range(_metadata, PR_SYS_DISPATCH_INCLUSIVE_ON, syscall_addr+1, 1, false); + SYSCALL_DISPATCH_OFF(glob_sel); +} + TEST_HARNESS_MAIN
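The dispatch_range test above pins down the semantics of the two modes: with PR_SYS_DISPATCH_EXCLUSIVE_ON the [offset, offset + len) region is exempt from dispatch (syscalls issued from it run natively), while with PR_SYS_DISPATCH_INCLUSIVE_ON only syscalls issued from inside the region are dispatched to the SIGSYS handler. The following is a minimal userspace sketch of the inclusive mode; it is not part of the patch, and guest_code_start/GUEST_CODE_LEN are hypothetical placeholders for a real sandboxed code region:

#include <signal.h>
#include <sys/prctl.h>

#ifndef PR_SET_SYSCALL_USER_DISPATCH
# define PR_SET_SYSCALL_USER_DISPATCH 59
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1
#endif
#ifndef PR_SYS_DISPATCH_INCLUSIVE_ON
# define PR_SYS_DISPATCH_INCLUSIVE_ON 2
#endif

static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

/* hypothetical region holding the sandboxed (guest) code */
extern char guest_code_start[];
#define GUEST_CODE_LEN 0x10000UL

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	/* emulate info->si_syscall on behalf of the guest here */
}

static int setup_syscall_dispatch(void)
{
	struct sigaction act = {
		.sa_sigaction = sigsys_handler,
		.sa_flags = SA_SIGINFO,
	};

	if (sigaction(SIGSYS, &act, NULL))
		return -1;
	/* only syscalls issued from the guest region get dispatched */
	if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_INCLUSIVE_ON,
		  (unsigned long)guest_code_start, GUEST_CODE_LEN, &selector))
		return -1;
	selector = SYSCALL_DISPATCH_FILTER_BLOCK;	/* start trapping */
	return 0;
}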