diff --git a/tf_sig_build_dockerfiles/Dockerfile b/tf_sig_build_dockerfiles/Dockerfile index a9796b57..98bbed9d 100644 --- a/tf_sig_build_dockerfiles/Dockerfile +++ b/tf_sig_build_dockerfiles/Dockerfile @@ -15,7 +15,7 @@ COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh RUN /build_devtoolset.sh devtoolset-7 /dt7 ################################################################################ -FROM nvidia/cuda:11.2.1-base-ubuntu20.04 as devel +FROM nvidia/cuda:11.2.2-base-ubuntu20.04 as devel ################################################################################ COPY --from=builder /dt7 /dt7 diff --git a/tf_sig_build_dockerfiles/devel.bashrc b/tf_sig_build_dockerfiles/devel.bashrc index c3c8928d..755d4878 100644 --- a/tf_sig_build_dockerfiles/devel.bashrc +++ b/tf_sig_build_dockerfiles/devel.bashrc @@ -22,3 +22,5 @@ export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > " export TERM=xterm-256color alias grep="grep --color=auto" alias ls="ls --color=auto" +# Fix nvidia-docker +ldconfig diff --git a/tf_sig_build_dockerfiles/devel.packages.txt b/tf_sig_build_dockerfiles/devel.packages.txt index fa761d93..7e327e83 100644 --- a/tf_sig_build_dockerfiles/devel.packages.txt +++ b/tf_sig_build_dockerfiles/devel.packages.txt @@ -1,10 +1,11 @@ # All required CUDA packages nvidia-profiler -cuda-11-2 cuda-command-line-tools-11-2 cuda-cudart-dev-11-2 cuda-cupti-11-2 cuda-nvprune-11-2 +cuda-libraries-11-2 +cuda-libraries-dev-11-2 libcufft-11-2 libcurand-11-2 libcusolver-dev-11-2 diff --git a/tf_sig_build_dockerfiles/devel.requirements.txt b/tf_sig_build_dockerfiles/devel.requirements.txt index 8d59bcf3..663e0171 100644 --- a/tf_sig_build_dockerfiles/devel.requirements.txt +++ b/tf_sig_build_dockerfiles/devel.requirements.txt @@ -45,3 +45,7 @@ PyYAML ~= 5.4.1 # For uploading auditwheel ~= 5.0.0 twine ~= 3.6.0 + +# For user tool scripts +junitparser ~= 2.2.0 +lxml ~= 4.6.4 diff --git a/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc 
b/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc index 8c3bd2e3..30060a21 100644 --- a/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc +++ b/tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc @@ -32,11 +32,36 @@ build --copt=-mavx --host_copt=-mavx # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling build --profile=/tf/pkg/profile.json -# Store the execution log binary in the mounted artifact directory. -# This log is recommended to debug missing cache hit on the same machine and on different machines. -# See more at step 2 and 3c in: -# https://docs.bazel.build/versions/main/remote-caching-debug.html -build --execution_log_binary_file=/tf/pkg/exec_cpu.log - # Use the NVCC toolchain to compile for manylinux2010 build --crosstool_top=@ubuntu18.04-gcc7_manylinux2010-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain + +# The following test-related settings are experimental. + +test --build_tests_only --keep_going --test_output=errors --verbose_failures=true +test --local_test_jobs=HOST_CPUS +test --flaky_test_attempts=3 --test_size_filters=small,medium +test --test_env=LD_LIBRARY_PATH +test --test_lang_filters=py + +# "nonpip" tests are regular py_test tests. +test:nonpip --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only +test:nonpip --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only +test:nonpip -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... + +# "pip tests" run a similar suite of tests as the "nonpip" tests, but do something +# odd to attempt to validate the quality of the pip package. 
The wheel is +# installed into a virtual environment, and then that venv is used to run all +# bazel tests with a special flag "--define=no_tensorflow_py_deps=true", which +# drops all the bazel dependencies for each py_test; this makes all the tests +# use the wheel's TensorFlow installation instead of the one made available +# through bazel. This must be done in a different root directory, //bazel_pip/..., +# because "import tensorflow" run from the root directory would import +# the folder instead of the venv package. +test:pip --action_env PYTHON_BIN_PATH="/bazel_pip/bin/python3" +test:pip --action_env PYTHON_LIB_PATH="/bazel_pip/lib/python3/site-packages" +test:pip --python_path="/bazel_pip/bin/python3" +# Yes, we don't exclude the gpu tests on pip for some reason. +test:pip --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only +test:pip --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only +test:pip --define=no_tensorflow_py_deps=true +test:pip -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... diff --git a/tf_sig_build_dockerfiles/devel.usertools/get_test_list.sh b/tf_sig_build_dockerfiles/devel.usertools/get_test_list.sh new file mode 100755 index 00000000..a2ec9f41 --- /dev/null +++ b/tf_sig_build_dockerfiles/devel.usertools/get_test_list.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Usage: get_test_list.sh OUTPUT BAZEL_TEST_COMMAND... +# Writes the list of tests that would be run from BAZEL_TEST_COMMAND to OUTPUT. +# Hides all extra output and always exits with success for now. 
+OUTPUT=$1 +shift +"$@" --check_tests_up_to_date 2>/dev/null | sort -u | awk '{print $1}' | grep "^//" | tee $OUTPUT diff --git a/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc b/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc index 971c9c9b..9125325e 100644 --- a/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc +++ b/tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc @@ -32,12 +32,6 @@ build --copt=-mavx --host_copt=-mavx # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling build --profile=/tf/pkg/profile.json -# Store the execution log binary in the mounted artifact directory. -# This log is recommended to debug missing cache hit on the same machine and on different machines. -# See more at step 2 and 3c in: -# https://docs.bazel.build/versions/main/remote-caching-debug.html -build --execution_log_binary_file=/tf/pkg/exec_gpu.log - # CUDA: Set up compilation CUDA version and paths build --@local_config_cuda//:enable_cuda build --repo_env TF_NEED_CUDA=1 @@ -58,3 +52,35 @@ build --repo_env TF_NEED_TENSORRT=1 # TODO(angerson, perfinion): What does sm_ vs compute_ mean? # TODO(angerson, perfinion): How can users select a good value for this? build --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" + +# The following test-related settings are experimental. + +test --build_tests_only --keep_going --test_output=errors --verbose_failures=true +test --flaky_test_attempts=3 --test_size_filters=small,medium +test --test_env=LD_LIBRARY_PATH +test --test_lang_filters=py +# Local test jobs has to be 4 because parallel_gpu_execute is fragile, I think +test --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute + +# "nonpip" tests are regular py_test tests. 
+test:nonpip --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11 +test:nonpip --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11 +test:nonpip -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... + +# "pip tests" run a similar suite of tests as the "nonpip" tests, but do something +# odd to attempt to validate the quality of the pip package. The wheel is +# installed into a virtual environment, and then that venv is used to run all +# bazel tests with a special flag "--define=no_tensorflow_py_deps=true", which +# drops all the bazel dependencies for each py_test; this makes all the tests +# use the wheel's TensorFlow installation instead of the one made available +# through bazel. This must be done in a different root directory, //bazel_pip/..., +# because "import tensorflow" run from the root directory would import +# the folder instead of the venv package. +test:pip --action_env PYTHON_BIN_PATH="/bazel_pip/bin/python3" +test:pip --action_env PYTHON_LIB_PATH="/bazel_pip/lib/python3/site-packages" +test:pip --python_path="/bazel_pip/bin/python3" +# Yes, we don't exclude the gpu tests on pip for some reason. +test:pip --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11,-no_pip,-nopip +test:pip --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11,-no_pip,-nopip +test:pip --define=no_tensorflow_py_deps=true +test:pip -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/... 
diff --git a/tf_sig_build_dockerfiles/devel.usertools/setup_venv_test.sh b/tf_sig_build_dockerfiles/devel.usertools/setup_venv_test.sh new file mode 100755 index 00000000..ee7212cd --- /dev/null +++ b/tf_sig_build_dockerfiles/devel.usertools/setup_venv_test.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euxo pipefail + +# Run this from inside the tensorflow github directory. +# Usage: setup_venv_test.sh venv_and_symlink_name "glob pattern for one wheel file" +# Example: setup_venv_test.sh bazel_pip "/tf/pkg/*.whl" +# +# This will create a venv with that wheel file installed in it, and a symlink +# in ./venv_and_symlink_name/tensorflow to ./tensorflow. We use this for the +# "pip" tests. + +python -m venv /$1 +mkdir -p $1 +rm -f ./$1/tensorflow +ln -s $(ls /$1/lib) /$1/lib/python3 +ln -s ../tensorflow $1/tensorflow +# extglob is necessary for @(a|b) pattern matching +# see "extglob" in the bash manual page ($ man bash) +bash -O extglob -c "/$1/bin/pip install $2" +/$1/bin/pip install -r /usertools/test.requirements.txt diff --git a/tf_sig_build_dockerfiles/devel.usertools/squash_testlogs.py b/tf_sig_build_dockerfiles/devel.usertools/squash_testlogs.py new file mode 100755 index 00000000..13303d7f --- /dev/null +++ b/tf_sig_build_dockerfiles/devel.usertools/squash_testlogs.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# +# Usage: squash_testlogs.py START_DIRECTORY OUTPUT_FILE +# +# Example: squash_testlogs.py /tf/pkg/testlogs /tf/pkg/merged.xml +# +# Recursively find all the JUnit test.xml files in one directory, and merge any +# of them that contain failures into one file. The TensorFlow DevInfra team +# uses this to generate a simple overview of an entire pip and nonpip test +# invocation, since the normal logs that Bazel creates are too large for the +# internal invocation viewer. 
+import glob +import os +import sys +from junitparser import JUnitXml +from lxml import etree +import subprocess +import re + +result = JUnitXml() +try: + files = subprocess.check_output(["grep", "-rlE", '(failures|errors)="[1-9]', sys.argv[1]]) +except subprocess.CalledProcessError as e: + print("No failures found to log!") + exit(0) + +# For test cases, only show the ones that failed that have text (a log) +seen = set() + +for f in files.strip().splitlines(): + # Include only "test.xml" files, as "attempt_x" files repeat the same thing. + if not f.endswith(b"test.xml"): + continue + # Skip files that fail to parse; without the continue, a parse failure + # would leave "r" unset (NameError) or stale from the previous iteration. + try: + r = JUnitXml.fromfile(f) + except Exception as e: + print("Ignoring this XML parse failure in {}: ".format(f), str(e)); continue + + for testsuite in r: + # Remove empty testcases + for p in testsuite._elem.xpath('.//testcase'): + if not len(p): + testsuite._elem.remove(p) + # Convert "testsuite > testcase,system-out" to "testsuite > testcase" + for p in testsuite._elem.xpath('.//system-out'): + for c in p.getparent().xpath('.//error | .//failure'): + c.text = p.text + p.getparent().remove(p) + # Include a note about which log file each error/failure came from + for p in testsuite._elem.xpath('.//error | .//failure'): + short_name = re.search(r'/(bazel_pip|tensorflow)/.*', f.decode("utf-8")).group(0) + p.text += f"\nNOTE: From /{short_name}" + if "bazel_pip" in short_name: + p.text += "\nNOTE: This was a pip test. Remove 'bazel_pip' to find the real target." 
+ # Remove this testsuite if it doesn't have anything in it any more + if len(testsuite) == 0: + r._elem.remove(testsuite._elem) + if len(r) > 0: + result += r + +os.makedirs(os.path.dirname(sys.argv[2]), exist_ok=True) +result.update_statistics() +result.write(sys.argv[2]) diff --git a/tf_sig_build_dockerfiles/devel.usertools/test.requirements.txt b/tf_sig_build_dockerfiles/devel.usertools/test.requirements.txt new file mode 100644 index 00000000..84edd981 --- /dev/null +++ b/tf_sig_build_dockerfiles/devel.usertools/test.requirements.txt @@ -0,0 +1,6 @@ +# Test dependencies for pip tests +grpcio ~= 1.42.0 +portpicker ~= 1.4.0 +scipy ~= 1.5.4 +jax ~= 0.2.18 +jaxlib ~= 0.1.70