Add utilities for running release tests #56


Merged
merged 30 commits on Jan 6, 2022
Commits
4ff35f7
wip
angerson Nov 24, 2021
cee34ed
Prevent failure if directory does not exist
angerson Dec 2, 2021
3f5a3cc
Add comments to venv test setup
angerson Dec 3, 2021
46bbdaa
Fix mistaken dependency on Python 3.9
angerson Dec 3, 2021
d3603ee
Ignore log fails and fix bazel issue
angerson Dec 3, 2021
7dc0d0a
Discard extra content if larger than 10MiB
angerson Dec 3, 2021
b8f60ea
Add test reqs and fix log squasher
angerson Dec 6, 2021
419a339
Cleanup
angerson Dec 6, 2021
c659d8e
Squash different and dont install cuda-11-2, which seems to fix it
angerson Dec 9, 2021
5013db3
Add CUDA libraries metapackages and dont overshorten logs
angerson Dec 9, 2021
f25ff7a
Fix broken log squasher
angerson Dec 10, 2021
f193459
Add missing nopip filters
angerson Dec 10, 2021
66d4033
Add test list fetcher
angerson Dec 10, 2021
ca384bd
Always pass and be quiet
angerson Dec 10, 2021
d36e1b3
Try faster file search
angerson Dec 10, 2021
37ca4ef
Fix test requirements versions
angerson Dec 10, 2021
bf4069e
fix broken grep call
angerson Dec 13, 2021
080fd48
Ignore grep failures properly
angerson Dec 13, 2021
46fe135
Include relative filename in testcase name
angerson Dec 14, 2021
a6f6663
Split on TensorFlow path specifically
angerson Dec 14, 2021
9bf2091
Meaningless commit to reset the check list
angerson Dec 21, 2021
3f9b2b2
Fix log squasher
angerson Dec 29, 2021
799629f
More deduplication for test logs
angerson Dec 29, 2021
5a3271e
Fix deduplication
angerson Dec 29, 2021
f13e577
fix typo
angerson Dec 29, 2021
a28f691
Add extglob
angerson Dec 30, 2021
39dda1d
Add messy note about log source
angerson Dec 31, 2021
c152bd7
much shorter?
angerson Jan 6, 2022
23af240
Greatly simplify the log squasher
angerson Jan 6, 2022
66037a2
Add another note
angerson Jan 6, 2022
2 changes: 1 addition & 1 deletion tf_sig_build_dockerfiles/Dockerfile
@@ -15,7 +15,7 @@ COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh
RUN /build_devtoolset.sh devtoolset-7 /dt7

################################################################################
FROM nvidia/cuda:11.2.1-base-ubuntu20.04 as devel
FROM nvidia/cuda:11.2.2-base-ubuntu20.04 as devel
################################################################################

COPY --from=builder /dt7 /dt7
2 changes: 2 additions & 0 deletions tf_sig_build_dockerfiles/devel.bashrc
@@ -22,3 +22,5 @@ export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > "
export TERM=xterm-256color
alias grep="grep --color=auto"
alias ls="ls --color=auto"
# Fix nvidia-docker
ldconfig
3 changes: 2 additions & 1 deletion tf_sig_build_dockerfiles/devel.packages.txt
@@ -1,10 +1,11 @@
# All required CUDA packages
nvidia-profiler
cuda-11-2
cuda-command-line-tools-11-2
cuda-cudart-dev-11-2
cuda-cupti-11-2
cuda-nvprune-11-2
cuda-libraries-11-2
cuda-libraries-dev-11-2
libcufft-11-2
libcurand-11-2
libcusolver-dev-11-2
4 changes: 4 additions & 0 deletions tf_sig_build_dockerfiles/devel.requirements.txt
@@ -45,3 +45,7 @@ PyYAML ~= 5.4.1
# For uploading
auditwheel ~= 5.0.0
twine ~= 3.6.0

# For user tool scripts
junitparser ~= 2.2.0
lxml ~= 4.6.4
37 changes: 31 additions & 6 deletions tf_sig_build_dockerfiles/devel.usertools/cpu.bazelrc
@@ -32,11 +32,36 @@ build --copt=-mavx --host_copt=-mavx
# See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling
build --profile=/tf/pkg/profile.json

# Store the execution log binary in the mounted artifact directory.
# This log is useful for debugging missing cache hits, both on the same
# machine and across machines. See steps 2 and 3c in:
# https://docs.bazel.build/versions/main/remote-caching-debug.html
build --execution_log_binary_file=/tf/pkg/exec_cpu.log

# Use the NVCC toolchain to compile for manylinux2010
build [email protected]_manylinux2010-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain

# The following test-related settings are experimental.

test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test --local_test_jobs=HOST_CPUS
test --flaky_test_attempts=3 --test_size_filters=small,medium
test --test_env=LD_LIBRARY_PATH
test --test_lang_filters=py

# "nonpip" tests are regular py_test tests.
test:nonpip --test_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
test:nonpip --build_tag_filters=-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
test:nonpip -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/...

# "pip" tests run a suite similar to the "nonpip" tests, but in an unusual
# way that attempts to validate the quality of the pip package. The wheel is
# installed into a virtual environment, and that venv is used to run all
# bazel tests with a special flag, "--define=no_tensorflow_py_deps=true",
# which drops all the bazel dependencies for each py_test; this makes the
# tests use the wheel's TensorFlow installation instead of the one made
# available through bazel. This must be done under a different root
# directory, //bazel_pip/..., because "import tensorflow", run from the
# repository root, would import
[Review comment] @bhack (Contributor), Dec 2, 2021:
Do you think that this will work in the same root dir?
https://github.com/tensorflow/tensorflow/pull/50163/files#diff-364cf915db9af8a15c691cddfdb7a5808e927715ada650b01911e74d1e2ba944R187
It is a little bit old so I don't remember if it worked or not.

[Review comment] Contributor:
In the end, until the cache reproducibility is verified, it could be really useful if we could find a solution to run in-source python tests on pip-installed wheels. Do you think that there is a quick workaround to implement this modality as well?
# the folder instead of the venv package.
test:pip --action_env PYTHON_BIN_PATH="/bazel_pip/bin/python3"
test:pip --action_env PYTHON_LIB_PATH="/bazel_pip/lib/python3/site-packages"
test:pip --python_path="/bazel_pip/bin/python3"
# Yes, we don't exclude the gpu tests on pip for some reason.
test:pip --test_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only
test:pip --build_tag_filters=-nopip,-no_pip,-no_oss,-oss_serial,-v1only
test:pip --define=no_tensorflow_py_deps=true
test:pip -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/...
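Taken together, a typical use of these configs from inside the build container might look like the following. This is only a command sketch: the `/usertools` bazelrc path and the `/tf/pkg/*.whl` wheel location are assumptions based on the file locations in this PR.

```shell
# Run the regular in-tree py_test suite (hypothetical invocation):
bazel --bazelrc=/usertools/cpu.bazelrc test --config=nonpip

# For pip-mode tests: install the built wheel into a venv and create the
# bazel_pip symlink first (setup_venv_test.sh is added elsewhere in this PR),
# then run the same tests against the wheel under //bazel_pip/...:
/usertools/setup_venv_test.sh bazel_pip "/tf/pkg/*.whl"
bazel --bazelrc=/usertools/cpu.bazelrc test --config=pip
```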
8 changes: 8 additions & 0 deletions tf_sig_build_dockerfiles/devel.usertools/get_test_list.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# Usage: get_test_list.sh OUTPUT BAZEL_TEST_COMMAND...
# Writes the list of tests that would be run from BAZEL_TEST_COMMAND to OUTPUT.
# Hides all extra output and always exits with success for now.
OUTPUT=$1
shift
"$@" --check_tests_up_to_date 2>/dev/null | sort -u | awk '{print $1}' | grep "^//" | tee "$OUTPUT"
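The interesting part of the script is the filter pipeline. Stubbing the bazel invocation with `printf` (made-up target names) shows what survives it: duplicate lines collapse under `sort -u`, `awk` keeps only the first column, and `grep` keeps only bazel labels:

```shell
# Simulated --check_tests_up_to_date output, run through the same pipeline
# get_test_list.sh uses; only the deduplicated //... label survives.
printf '//tensorflow/python:a_test cached\n//tensorflow/python:a_test cached\nINFO: Elapsed time\n' \
  | sort -u | awk '{print $1}' | grep "^//"
```

This prints `//tensorflow/python:a_test` once, dropping the duplicate and the `INFO:` line.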
38 changes: 32 additions & 6 deletions tf_sig_build_dockerfiles/devel.usertools/gpu.bazelrc
@@ -32,12 +32,6 @@ build --copt=-mavx --host_copt=-mavx
# See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling
build --profile=/tf/pkg/profile.json

# Store the execution log binary in the mounted artifact directory.
# This log is recommended to debug missing cache hit on the same machine and on different machines.
# See more at step 2 and 3c in:
# https://docs.bazel.build/versions/main/remote-caching-debug.html
build --execution_log_binary_file=/tf/pkg/exec_gpu.log

# CUDA: Set up compilation CUDA version and paths
build --@local_config_cuda//:enable_cuda
build --repo_env TF_NEED_CUDA=1
@@ -58,3 +52,35 @@ build --repo_env TF_NEED_TENSORRT=1
# TODO(angerson, perfinion): What does sm_ vs compute_ mean?
# TODO(angerson, perfinion): How can users select a good value for this?
build --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"

# The following test-related settings are experimental.

test --build_tests_only --keep_going --test_output=errors --verbose_failures=true
test --flaky_test_attempts=3 --test_size_filters=small,medium
test --test_env=LD_LIBRARY_PATH
test --test_lang_filters=py
# Local test jobs have to be capped at 4 because parallel_gpu_execute is fragile, I think
test --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute

# "nonpip" tests are regular py_test tests.
test:nonpip --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11
test:nonpip --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11
test:nonpip -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/compiler/xrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/...

# "pip" tests run a suite similar to the "nonpip" tests, but in an unusual
# way that attempts to validate the quality of the pip package. The wheel is
# installed into a virtual environment, and that venv is used to run all
# bazel tests with a special flag, "--define=no_tensorflow_py_deps=true",
# which drops all the bazel dependencies for each py_test; this makes the
# tests use the wheel's TensorFlow installation instead of the one made
# available through bazel. This must be done under a different root
# directory, //bazel_pip/..., because "import tensorflow", run from the
# repository root, would import the local source folder instead of the
# venv package.
test:pip --action_env PYTHON_BIN_PATH="/bazel_pip/bin/python3"
test:pip --action_env PYTHON_LIB_PATH="/bazel_pip/lib/python3/site-packages"
test:pip --python_path="/bazel_pip/bin/python3"
# Yes, we don't exclude the gpu tests on pip for some reason.
test:pip --test_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11,-no_pip,-nopip
test:pip --build_tag_filters=gpu,requires-gpu,-no_gpu,-no_oss,-oss_serial,-no_cuda11,-no_pip,-nopip
test:pip --define=no_tensorflow_py_deps=true
test:pip -- //bazel_pip/tensorflow/... -//bazel_pip/tensorflow/python/integration_testing/... -//bazel_pip/tensorflow/compiler/tf2tensorrt/... -//bazel_pip/tensorflow/compiler/xrt/... -//bazel_pip/tensorflow/core/tpu/... -//bazel_pip/tensorflow/lite/...
20 changes: 20 additions & 0 deletions tf_sig_build_dockerfiles/devel.usertools/setup_venv_test.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -euxo pipefail

# Run this from inside the tensorflow github directory.
# Usage: setup_venv_test.sh venv_and_symlink_name "glob pattern for one wheel file"
# Example: setup_venv_test.sh bazel_pip "/tf/pkg/*.whl"
#
# This will create a venv with that wheel file installed in it, and a symlink
# in ./venv_and_symlink_name/tensorflow to ./tensorflow. We use this for the
# "pip" tests.

python -m venv "/$1"
mkdir -p "$1"
rm -f "./$1/tensorflow"
ln -s "$(ls "/$1/lib")" "/$1/lib/python3"
ln -s ../tensorflow "$1/tensorflow"
# extglob is necessary for @(a|b) pattern matching
# see "extglob" in the bash manual page ($ man bash)
bash -O extglob -c "/$1/bin/pip install $2"
/$1/bin/pip install -r /usertools/test.requirements.txt
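The layout this script produces can be sketched with stub directories in place of a real venv and wheel. All paths below are scratch stand-ins, not the script's real `/$1` root; `venv/lib/python3.8` stands in for whatever single `python3.x` directory the venv creates.

```shell
# Recreate the two symlinks setup_venv_test.sh makes, in a scratch dir:
tmp=$(mktemp -d) && cd "$tmp"
mkdir -p venv/lib/python3.8 tensorflow bazel_pip  # stand-ins for venv, repo dir, test root
ln -s "$(ls venv/lib)" venv/lib/python3           # version-agnostic "python3" alias
ln -s ../tensorflow bazel_pip/tensorflow          # lets //bazel_pip/... targets see the sources
readlink venv/lib/python3                         # -> python3.8
```

The `python3` alias is why the bazelrc files can hard-code `PYTHON_LIB_PATH="/bazel_pip/lib/python3/site-packages"` without knowing the exact Python minor version.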
64 changes: 64 additions & 0 deletions tf_sig_build_dockerfiles/devel.usertools/squash_testlogs.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
#
# Usage: squash_testlogs.py START_DIRECTORY OUTPUT_FILE
#
# Example: squash_testlogs.py /tf/pkg/testlogs /tf/pkg/merged.xml
#
# Recursively find all the JUnit test.xml files in one directory, and merge any
# of them that contain failures into one file. The TensorFlow DevInfra team
# uses this to generate a simple overview of an entire pip and nonpip test
# invocation, since the normal logs that Bazel creates are too large for the
# internal invocation viewer.
import glob
import os
import sys
from junitparser import JUnitXml
from lxml import etree
import subprocess
import re

result = JUnitXml()
try:
  files = subprocess.check_output(
      ["grep", "-rlE", '(failures|errors)="[1-9]', sys.argv[1]])
except subprocess.CalledProcessError:
  print("No failures found to log!")
  exit(0)

# For test cases, keep only the failed ones that have text (i.e. a log)
seen = set()

for f in files.strip().splitlines():
  # Include only "test.xml" files, as "attempt_x" files repeat the same thing.
  if not f.endswith(b"test.xml"):
    continue
  # Skip files that fail to parse; they're probably not important
  try:
    r = JUnitXml.fromfile(f)
  except Exception as e:
    print("Ignoring this XML parse failure in {}: ".format(f), str(e))
    continue

  for testsuite in r:
    # Remove empty testcases
    for p in testsuite._elem.xpath('.//testcase'):
      if not len(p):
        testsuite._elem.remove(p)
    # Convert "testsuite > testcase,system-out" to "testsuite > testcase"
    for p in testsuite._elem.xpath('.//system-out'):
      for c in p.getparent().xpath('.//error | .//failure'):
        c.text = p.text
      p.getparent().remove(p)
    # Include a note about which log file each failure came from
    for p in testsuite._elem.xpath('.//error | .//failure'):
      short_name = re.search(r'/(bazel_pip|tensorflow)/.*', f.decode("utf-8")).group(0)
      p.text += f"\nNOTE: From /{short_name}"
      if "bazel_pip" in short_name:
        p.text += "\nNOTE: This was a pip test. Remove 'bazel_pip' to find the real target."
    # Remove this testsuite if it doesn't have anything in it any more
    if len(testsuite) == 0:
      r._elem.remove(testsuite._elem)
  if len(r) > 0:
    result += r

os.makedirs(os.path.dirname(sys.argv[2]), exist_ok=True)
result.update_statistics()
result.write(sys.argv[2])
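The script's first step, finding only the JUnit files that actually contain failures, is easy to check in isolation. The grep pattern is the one the script uses; the file contents and directory names are made up:

```shell
# Two minimal JUnit files: one passing, one with a failure count.
tmp=$(mktemp -d)
mkdir -p "$tmp/pass" "$tmp/fail"
printf '<testsuite failures="0" errors="0"></testsuite>' > "$tmp/pass/test.xml"
printf '<testsuite failures="1" errors="0"></testsuite>' > "$tmp/fail/test.xml"
# Matches only files whose failures= or errors= count starts with a nonzero digit
grep -rlE '(failures|errors)="[1-9]' "$tmp"
```

Only `$tmp/fail/test.xml` is listed; the all-zero file never reaches the Python parsing stage.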
@@ -0,0 +1,6 @@
# Test dependencies for pip tests
grpcio ~= 1.42.0
portpicker ~= 1.4.0
scipy ~= 1.5.4
jax ~= 0.2.18
jaxlib ~= 0.1.70