diff --git a/.cloud-build/CheckPythonVersion.py b/.cloud-build/CheckPythonVersion.py new file mode 100644 index 00000000000..06fc6e4ae2e --- /dev/null +++ b/.cloud-build/CheckPythonVersion.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# # Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +MINIMUM_MAJOR_VERSION = 3 +MINIMUM_MINOR_VERSION = 5 + +if ( + sys.version_info.major >= MINIMUM_MAJOR_VERSION + or sys.version_info.minor >= MINIMUM_MINOR_VERSION +): + print(f"Python version acceptable: {sys.version}") + exit(0) +else: + print( + f"Error: Python version less than {MINIMUM_MAJOR_VERSION}.{MINIMUM_MINOR_VERSION}" + ) + exit(1) diff --git a/.cloud-build/cleanup/cleanup-cloudbuild.yaml b/.cloud-build/cleanup/cleanup-cloudbuild.yaml new file mode 100644 index 00000000000..8f83c9644c2 --- /dev/null +++ b/.cloud-build/cleanup/cleanup-cloudbuild.yaml @@ -0,0 +1,8 @@ +steps: + # Install Python dependencies and run cleanup script + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 -m pip install -U -r .cloud-build/cleanup/cleanup-requirements.txt && python3 .cloud-build/cleanup/cleanup.py' +timeout: 86400s diff --git a/.cloud-build/cleanup/cleanup.py b/.cloud-build/cleanup/cleanup.py new file mode 100644 index 00000000000..e5d34507461 --- /dev/null +++ b/.cloud-build/cleanup/cleanup.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# # Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from resource_cleanup_manager import ( + ResourceCleanupManager, + DatasetResourceCleanupManager, + EndpointResourceCleanupManager, + ModelResourceCleanupManager, +) + + +def run_cleanup_managers(managers: List[ResourceCleanupManager], is_dry_run: bool): + for manager in managers: + type_name = manager.type_name + + print(f"Fetching {type_name}'s...") + resources = manager.list() + print(f"Found {len(resources)} {type_name}'s") + for resource in resources: + if not manager.is_deletable(resource): + continue + + if is_dry_run: + resource_name = manager.resource_name(resource) + print(f"Will delete '{type_name}': {resource_name}") + else: + try: + manager.delete(resource) + except Exception as exception: + print(exception) + + +is_dry_run = False + +# List of all cleanup managers +managers = [ + DatasetResourceCleanupManager(), + EndpointResourceCleanupManager(), + ModelResourceCleanupManager(), +] + +run_cleanup_managers(managers=managers, is_dry_run=is_dry_run) diff --git a/.cloud-build/cleanup/requirements.txt b/.cloud-build/cleanup/requirements.txt new file mode 100644 index 00000000000..a08de87d409 --- /dev/null +++ b/.cloud-build/cleanup/requirements.txt @@ -0,0 +1 @@ +google-cloud-aiplatform==1.12.1 \ No newline at end of file diff --git a/.cloud-build/cleanup/resource_cleanup_manager.py b/.cloud-build/cleanup/resource_cleanup_manager.py new file mode 100644 index 00000000000..3f4c7f344ef --- /dev/null +++ 
b/.cloud-build/cleanup/resource_cleanup_manager.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# # Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +from google.cloud import aiplatform +from typing import Any +from proto.datetime_helpers import DatetimeWithNanoseconds +from google.cloud.aiplatform import base + +# If a resource was updated within this number of seconds, do not delete. +RESOURCE_UPDATE_BUFFER_IN_SECONDS = 60 * 60 * 8 + + +class ResourceCleanupManager(abc.ABC): + @property + @abc.abstractmethod + def type_name(str) -> str: + pass + + @abc.abstractmethod + def list(self) -> Any: + pass + + @abc.abstractmethod + def resource_name(self, resource: Any) -> str: + pass + + @abc.abstractmethod + def delete(self, resource: Any): + pass + + @abc.abstractmethod + def get_seconds_since_modification(self, resource: Any) -> float: + pass + + def is_deletable(self, resource: Any) -> bool: + time_difference = self.get_seconds_since_modification(resource) + + if self.resource_name(resource).startswith("perm"): + print(f"Skipping '{resource}' due to name starting with 'perm'.") + return False + + # Check that it wasn't created too recently, to prevent race conditions + if time_difference <= RESOURCE_UPDATE_BUFFER_IN_SECONDS: + print( + f"Skipping '{resource}' due update_time being '{time_difference}', which is less than '{RESOURCE_UPDATE_BUFFER_IN_SECONDS}'." 
+ ) + return False + + return True + + +class VertexAIResourceCleanupManager(ResourceCleanupManager): + @property + @abc.abstractmethod + def vertex_ai_resource(self) -> base.VertexAiResourceNounWithFutureManager: + pass + + @property + def type_name(self) -> str: + return self.vertex_ai_resource._resource_noun + + def list(self) -> Any: + return self.vertex_ai_resource.list() + + def resource_name(self, resource: Any) -> str: + return resource.display_name + + def delete(self, resource): + resource.delete() + + def get_seconds_since_modification(self, resource: Any) -> bool: + update_time = resource.update_time + current_time = DatetimeWithNanoseconds.now(tz=update_time.tzinfo) + return (current_time - update_time).total_seconds() + + +class DatasetResourceCleanupManager(VertexAIResourceCleanupManager): + vertex_ai_resource = aiplatform.datasets._Dataset + + +class EndpointResourceCleanupManager(VertexAIResourceCleanupManager): + vertex_ai_resource = aiplatform.Endpoint + + def delete(self, resource): + resource.delete(force=True) + + +class ModelResourceCleanupManager(VertexAIResourceCleanupManager): + vertex_ai_resource = aiplatform.Model diff --git a/.cloud-build/execute_changed_notebooks_cli.py b/.cloud-build/execute_changed_notebooks_cli.py new file mode 100644 index 00000000000..84d1de387d2 --- /dev/null +++ b/.cloud-build/execute_changed_notebooks_cli.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""A CLI to process changed notebooks and execute them on Google Cloud Build""" + +import argparse +import pathlib +import execute_changed_notebooks_helper + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("String value expected.") + + +parser = argparse.ArgumentParser(description="Run changed notebooks.") +parser.add_argument( + "--test_paths_file", + type=pathlib.Path, + help="The path to the file that has newline-limited folders of notebooks that should be tested.", + required=True, +) +parser.add_argument( + "--base_branch", + help="The base git branch to diff against to find changed files.", + required=False, +) +parser.add_argument( + "--container_uri", + type=str, + help="The container uri to run each notebook in.", + required=True, +) +parser.add_argument( + "--variable_project_id", + type=str, + help="The GCP project id. This is used to inject a variable value into the notebook before running.", + required=True, +) +parser.add_argument( + "--variable_region", + type=str, + help="The GCP region. 
This is used to inject a variable value into the notebook before running.", + required=True, +) +parser.add_argument( + "--staging_bucket", + type=str, + help="The GCS bucket for staging temporary files.", + required=True, +) +parser.add_argument( + "--artifacts_bucket", + type=str, + help="The GCP directory for storing executed notebooks.", + required=True, +) +parser.add_argument( + "--should_parallelize", + type=str2bool, + nargs="?", + const=True, + default=True, + help="Should run notebooks in parallel.", +) + +args = parser.parse_args() + +notebooks = execute_changed_notebooks_helper.get_changed_notebooks( + test_paths_file=args.test_paths_file, + base_branch=args.base_branch, +) + +execute_changed_notebooks_helper.process_and_execute_notebooks( + notebooks=notebooks, + container_uri=args.container_uri, + staging_bucket=args.staging_bucket, + artifacts_bucket=args.artifacts_bucket, + variable_project_id=args.variable_project_id, + variable_region=args.variable_region, + should_parallelize=args.should_parallelize, +) diff --git a/.cloud-build/execute_changed_notebooks_helper.py b/.cloud-build/execute_changed_notebooks_helper.py new file mode 100644 index 00000000000..f454205f755 --- /dev/null +++ b/.cloud-build/execute_changed_notebooks_helper.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import concurrent +import dataclasses +import datetime +import functools +import os +import pathlib +import nbformat +import re +import subprocess +from typing import List, Optional +from tabulate import tabulate +import operator + +import execute_notebook_remote +from utils import util, NotebookProcessors +from google.cloud.devtools.cloudbuild_v1.types import BuildOperationMetadata + + +def format_timedelta(delta: datetime.timedelta) -> str: + """Formats a timedelta duration to [N days] %H:%M:%S format""" + seconds = int(delta.total_seconds()) + + secs_in_a_day = 86400 + secs_in_a_hour = 3600 + secs_in_a_min = 60 + + days, seconds = divmod(seconds, secs_in_a_day) + hours, seconds = divmod(seconds, secs_in_a_hour) + minutes, seconds = divmod(seconds, secs_in_a_min) + + time_fmt = f"{hours:02d}:{minutes:02d}:{seconds:02d}" + + if days > 0: + suffix = "s" if days > 1 else "" + return f"{days} day{suffix} {time_fmt}" + + return time_fmt + + +@dataclasses.dataclass +class NotebookExecutionResult: + name: str + duration: datetime.timedelta + is_pass: bool + log_url: str + output_uri: str + build_id: str + error_message: Optional[str] + + +def _process_notebook( + notebook_path: str, + variable_project_id: str, + variable_region: str, +): + # Read notebook + with open(notebook_path) as f: + nb = nbformat.read(f, as_version=4) + + # Create preprocessors + remove_no_execute_cells_preprocessor = NotebookProcessors.RemoveNoExecuteCells() + update_variables_preprocessor = NotebookProcessors.UpdateVariablesPreprocessor( + replacement_map={"PROJECT_ID": variable_project_id, "REGION": variable_region}, + ) + + # Use no-execute preprocessor + ( + nb, + resources, + ) = remove_no_execute_cells_preprocessor.preprocess(nb) + + (nb, resources) = update_variables_preprocessor.preprocess(nb, resources) + + with open(notebook_path, mode="w", encoding="utf-8") as new_file: + nbformat.write(nb, new_file) + + +def _create_tag(filepath: str) -> str: + tag = 
os.path.basename(os.path.normpath(filepath)) + tag = re.sub("[^0-9a-zA-Z_.-]+", "-", tag) + + if tag.startswith(".") or tag.startswith("-"): + tag = tag[1:] + + return tag + + +def process_and_execute_notebook( + container_uri: str, + staging_bucket: str, + artifacts_bucket: str, + variable_project_id: str, + variable_region: str, + notebook: str, + should_get_tail_logs: bool = False, +) -> NotebookExecutionResult: + print(f"Running notebook: {notebook}") + + # Create paths + notebook_output_uri = "/".join([artifacts_bucket, pathlib.Path(notebook).name]) + + # Create tag from notebook + tag = _create_tag(filepath=notebook) + + result = NotebookExecutionResult( + name=tag, + duration=datetime.timedelta(seconds=0), + is_pass=False, + output_uri=notebook_output_uri, + log_url="", + build_id="", + error_message=None, + ) + + # TODO: Handle cases where multiple notebooks have the same name + time_start = datetime.datetime.now() + operation = None + try: + # Pre-process notebook by substituting variable names + _process_notebook( + notebook_path=notebook, + variable_project_id=variable_project_id, + variable_region=variable_region, + ) + + # Upload the pre-processed code to a GCS bucket + code_archive_uri = util.archive_code_and_upload(staging_bucket=staging_bucket) + + operation = execute_notebook_remote.execute_notebook_remote( + code_archive_uri=code_archive_uri, + notebook_uri=notebook, + notebook_output_uri=notebook_output_uri, + container_uri=container_uri, + tag=tag, + ) + + operation_metadata = BuildOperationMetadata(mapping=operation.metadata) + result.build_id = operation_metadata.build.id + result.log_url = operation_metadata.build.log_url + + # Block and wait for the result + operation.result() + + result.duration = datetime.datetime.now() - time_start + result.is_pass = True + print(f"{notebook} PASSED in {format_timedelta(result.duration)}.") + except Exception as error: + result.error_message = str(error) + + if operation and should_get_tail_logs: + # 
Extract the logs + logs_bucket = operation_metadata.build.logs_bucket + + # Download tail end of logs file + log_file_uri = f"{logs_bucket}/log-{result.build_id}.txt" + + # Use gcloud to get tail + try: + result.error_message = subprocess.check_output( + ["gsutil", "cat", "-r", "-1000", log_file_uri], encoding="UTF-8" + ) + except Exception as error: + result.error_message = str(error) + + result.duration = datetime.datetime.now() - time_start + result.is_pass = False + + print( + f"{notebook} FAILED in {format_timedelta(result.duration)}: {result.error_message}" + ) + + return result + + +def get_changed_notebooks( + test_paths_file: str, + base_branch: Optional[str] = None, +) -> List[str]: + """ + Get the notebooks that exist under the folders defined in the test_paths_file. + It only returns notebooks that have differences from the Git base_branch. + """ + + test_paths = [] + with open(test_paths_file) as file: + lines = [line.strip() for line in file.readlines()] + lines = [line for line in lines if len(line) > 0] + test_paths = [line for line in lines] + + if len(test_paths) == 0: + raise RuntimeError("No test folders found.") + + print(f"Checking folders: {test_paths}") + + # Find notebooks + notebooks = [] + if base_branch: + print(f"Looking for notebooks that changed from branch: {base_branch}") + notebooks = subprocess.check_output( + ["git", "diff", "--name-only", f"origin/{base_branch}..."] + test_paths + ) + else: + print("Looking for all notebooks.") + notebooks = subprocess.check_output(["git", "ls-files"] + test_paths) + + notebooks = notebooks.decode("utf-8").split("\n") + notebooks = [notebook for notebook in notebooks if notebook.endswith(".ipynb")] + notebooks = [notebook for notebook in notebooks if len(notebook) > 0] + notebooks = [notebook for notebook in notebooks if pathlib.Path(notebook).exists()] + + return notebooks + + +def process_and_execute_notebooks( + notebooks: List[str], + container_uri: str, + staging_bucket: str, + 
artifacts_bucket: str, + variable_project_id: str, + variable_region: str, + should_parallelize: bool, +): + """ + Run the notebooks that exist under the folders defined in the test_paths_file. + It only runs notebooks that have differences from the Git base_branch. + The executed notebooks are saved in the artifacts_bucket. + Variables are also injected into the notebooks such as the variable_project_id and variable_region. + Args: + test_paths_file (str): + Required. The new-line delimited file to folders and files that need checking. + Folders are checked recursively. + base_branch (str): + Optional. If provided, only the files that have changed from the base_branch will be checked. + If not provided, all files will be checked. + staging_bucket (str): + Required. The GCS staging bucket to write source code to. + artifacts_bucket (str): + Required. The GCS staging bucket to write executed notebooks to. + variable_project_id (str): + Required. The value for PROJECT_ID to inject into notebooks. + variable_region (str): + Required. The value for REGION to inject into notebooks. + should_parallelize (bool): + Required. Should run notebooks in parallel using a thread pool as opposed to in sequence. + """ + notebook_execution_results: List[NotebookExecutionResult] = [] + + if len(notebooks) > 0: + print(f"Found {len(notebooks)} modified notebooks: {notebooks}") + + if should_parallelize and len(notebooks) > 1: + print( + "Running notebooks in parallel, so no logs will be displayed. Please wait..." 
+ ) + with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor: + notebook_execution_results = list( + executor.map( + functools.partial( + process_and_execute_notebook, + container_uri, + staging_bucket, + artifacts_bucket, + variable_project_id, + variable_region, + ), + notebooks, + ) + ) + else: + notebook_execution_results = [ + process_and_execute_notebook( + container_uri=container_uri, + staging_bucket=staging_bucket, + artifacts_bucket=artifacts_bucket, + variable_project_id=variable_project_id, + variable_region=variable_region, + notebook=notebook, + ) + for notebook in notebooks + ] + else: + print("No notebooks modified in this pull request.") + + print("\n=== RESULTS ===\n") + + results_sorted = sorted( + notebook_execution_results, + key=lambda result: result.is_pass, + reverse=True, + ) + + # Print results + print( + tabulate( + [ + [ + result.name, + "PASSED" if result.is_pass else "FAILED", + format_timedelta(result.duration), + result.log_url, + ] + for result in results_sorted + ], + headers=["build_tag", "status", "duration", "log_url"], + ) + ) + + print("\n=== END RESULTS===\n") + + total_notebook_duration = functools.reduce( + operator.add, + [datetime.timedelta(seconds=0)] + + [result.duration for result in results_sorted], + ) + + print(f"Cumulative notebook duration: {format_timedelta(total_notebook_duration)}") + + # Raise error if any notebooks failed + if not all([result.is_pass for result in results_sorted]): + raise RuntimeError("Notebook failures detected. See logs for details") diff --git a/.cloud-build/execute_notebook_cli.py b/.cloud-build/execute_notebook_cli.py new file mode 100644 index 00000000000..9545f9c4cd7 --- /dev/null +++ b/.cloud-build/execute_notebook_cli.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A CLI to download (optional) and run a single notebook locally""" + +import argparse +import execute_notebook_helper + +parser = argparse.ArgumentParser(description="Run a single notebook locally.") +parser.add_argument( + "--notebook_source", + type=str, + help="Local filepath or GCS URI to notebook.", + required=True, +) +parser.add_argument( + "--output_file_or_uri", + type=str, + help="Local file or GCS URI to save executed notebook to.", + required=True, +) + +args = parser.parse_args() +execute_notebook_helper.execute_notebook( + notebook_source=args.notebook_source, + output_file_or_uri=args.output_file_or_uri, + should_log_output=True, +) diff --git a/.cloud-build/execute_notebook_helper.py b/.cloud-build/execute_notebook_helper.py new file mode 100644 index 00000000000..d59b7b61680 --- /dev/null +++ b/.cloud-build/execute_notebook_helper.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Methods to run a notebook locally""" + +import sys +import os +import errno +import papermill as pm +import shutil + +from utils import util +from google.cloud.aiplatform import utils + +# This script is used to execute a notebook and write out the output notebook. + + +def execute_notebook( + notebook_source: str, + output_file_or_uri: str, + should_log_output: bool, +): + """Execute a single notebook using Papermill""" + file_name = os.path.basename(os.path.normpath(notebook_source)) + + # Download notebook if it's a GCS URI + if notebook_source.startswith("gs://"): + # Extract uri components + bucket_name, prefix = utils.extract_bucket_and_prefix_from_gcs_path( + notebook_source + ) + + # Download remote notebook to local file system + notebook_source = file_name + util.download_file( + bucket_name=bucket_name, blob_name=prefix, destination_file=notebook_source + ) + + execution_exception = None + + # Execute notebook + try: + # Execute notebook + pm.execute_notebook( + input_path=notebook_source, + output_path=notebook_source, + progress_bar=should_log_output, + request_save_on_cell_execute=should_log_output, + log_output=should_log_output, + stdout_file=sys.stdout if should_log_output else None, + stderr_file=sys.stderr if should_log_output else None, + ) + except Exception as exception: + execution_exception = exception + finally: + # Copy executed notebook + if output_file_or_uri.startswith("gs://"): + # Upload to GCS path + util.upload_file(notebook_source, remote_file_path=output_file_or_uri) + + print("\n=== EXECUTION FINISHED ===\n") + print( + f"Please debug the executed notebook by downloading: {output_file_or_uri}" + ) + print("\n======\n") + else: + # Create directories if they don't exist + if not os.path.exists(os.path.dirname(output_file_or_uri)): + try: + os.makedirs(os.path.dirname(output_file_or_uri)) + except OSError as exc: # Guard against race condition + if exc.errno != errno.EEXIST: + raise + + print(f"Writing output to: 
{output_file_or_uri}") + shutil.move(notebook_source, output_file_or_uri) + + if execution_exception: + raise execution_exception diff --git a/.cloud-build/execute_notebook_remote.py b/.cloud-build/execute_notebook_remote.py new file mode 100644 index 00000000000..fc4bc6411fc --- /dev/null +++ b/.cloud-build/execute_notebook_remote.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Methods to run a notebook on Google Cloud Build""" + +from google.protobuf import duration_pb2 +from yaml.loader import FullLoader + +import google.auth +from google.cloud.devtools import cloudbuild_v1 +from google.cloud.devtools.cloudbuild_v1.types import Source, StorageSource + +from typing import Optional +import yaml + +from google.cloud.aiplatform import utils +from google.api_core import operation + +CLOUD_BUILD_FILEPATH = ".cloud-build/notebook-execution-test-cloudbuild-single.yaml" +TIMEOUT_IN_SECONDS = 86400 + + +def execute_notebook_remote( + code_archive_uri: str, + notebook_uri: str, + notebook_output_uri: str, + container_uri: str, + tag: Optional[str], +) -> operation.Operation: + """Create and execute a single notebook on Google Cloud Build""" + + # Authorize the client with Google defaults + credentials, project_id = google.auth.default() + client = cloudbuild_v1.services.cloud_build.CloudBuildClient() + + build = cloudbuild_v1.Build() + + # The following build steps will output "hello world" + # For 
more information on build configuration, see + # https://cloud.google.com/build/docs/configuring-builds/create-basic-configuration + cloudbuild_config = yaml.load(open(CLOUD_BUILD_FILEPATH), Loader=FullLoader) + + substitutions = { + "_PYTHON_IMAGE": container_uri, + "_NOTEBOOK_GCS_URI": notebook_uri, + "_NOTEBOOK_OUTPUT_GCS_URI": notebook_output_uri, + } + + ( + source_archived_file_gcs_bucket, + source_archived_file_gcs_object, + ) = utils.extract_bucket_and_prefix_from_gcs_path(code_archive_uri) + + build.source = Source( + storage_source=StorageSource( + bucket=source_archived_file_gcs_bucket, + object_=source_archived_file_gcs_object, + ) + ) + + build.steps = cloudbuild_config["steps"] + build.substitutions = substitutions + build.timeout = duration_pb2.Duration(seconds=TIMEOUT_IN_SECONDS) + build.queue_ttl = duration_pb2.Duration(seconds=TIMEOUT_IN_SECONDS) + + if tag: + build.tags = [tag] + + operation = client.create_build(project_id=project_id, build=build) + # Print the in-progress operation + # TODO(developer): Uncomment next two lines + # print("IN PROGRESS:") + # print(operation.metadata) + + # Print the completed status + # TODO(developer): Uncomment next line + # print("RESULT:", result.status) + return operation diff --git a/.cloud-build/notebook-execution-test-cloudbuild-single.yaml b/.cloud-build/notebook-execution-test-cloudbuild-single.yaml new file mode 100644 index 00000000000..5e12755b13b --- /dev/null +++ b/.cloud-build/notebook-execution-test-cloudbuild-single.yaml @@ -0,0 +1,31 @@ +steps: + # Show the gcloud info and check if gcloud exists + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'gcloud config list' + # Check the Python version + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 .cloud-build/CheckPythonVersion.py' + # Install Python dependencies + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 -m pip install -U pip && python3 -m pip install -U --user -r 
.cloud-build/requirements.txt' + # Install Python dependencies and run testing script + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 -m pip install -U pip && python3 -m pip freeze && python3 .cloud-build/execute_notebook_cli.py --notebook_source "${_NOTEBOOK_GCS_URI}" --output_file_or_uri "${_NOTEBOOK_OUTPUT_GCS_URI}"' + env: + - 'IS_TESTING=1' +timeout: 86400s +options: + pool: + name: ${_PRIVATE_POOL_NAME} \ No newline at end of file diff --git a/.cloud-build/notebook-execution-test-cloudbuild.yaml b/.cloud-build/notebook-execution-test-cloudbuild.yaml new file mode 100644 index 00000000000..4d9d847569a --- /dev/null +++ b/.cloud-build/notebook-execution-test-cloudbuild.yaml @@ -0,0 +1,41 @@ +steps: + # Show the gcloud info and check if gcloud exists + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'gcloud config list' + # # Clone the Git repo + # - name: ${_PYTHON_IMAGE} + # entrypoint: git + # args: ['clone', "${_GIT_REPO}", "--branch", "${_GIT_BRANCH_NAME}", "."] + # Check the Python version + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 .cloud-build/CheckPythonVersion.py' + # Fetch base branch if required + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'if [ -n "${_BASE_BRANCH}" ]; then git fetch origin "${_BASE_BRANCH}":refs/remotes/origin/"${_BASE_BRANCH}"; else echo "Skipping fetch."; fi' + # Install Python dependencies + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 -m pip install -U pip && python3 -m pip install -U --user -r .cloud-build/requirements.txt' + # Install Python dependencies and run testing script + - name: ${_PYTHON_IMAGE} + entrypoint: /bin/sh + args: + - -c + - 'python3 -m pip install -U pip && python3 -m pip freeze && python3 .cloud-build/execute_changed_notebooks_cli.py --test_paths_file "${_TEST_PATHS_FILE}" --base_branch "${_FORCED_BASE_BRANCH}" --container_uri ${_PYTHON_IMAGE} --staging_bucket 
${_GCS_STAGING_BUCKET} --artifacts_bucket ${_GCS_STAGING_BUCKET}/executed_notebooks/PR_${_PR_NUMBER}/BUILD_${BUILD_ID} --variable_project_id ${PROJECT_ID} --variable_region ${_GCP_REGION}' + env: + - 'IS_TESTING=1' +timeout: 86400s +options: + pool: + name: ${_PRIVATE_POOL_NAME} \ No newline at end of file diff --git a/.cloud-build/requirements.txt b/.cloud-build/requirements.txt new file mode 100644 index 00000000000..33579836a3d --- /dev/null +++ b/.cloud-build/requirements.txt @@ -0,0 +1,12 @@ +ipython==8.3.0 +jupyter==1.0 +nbconvert==6.5.0 +papermill==2.3 +numpy==1.22.3 +pandas==1.4.2 +matplotlib==3.5.2 +tabulate==0.8.9 +google-cloud-aiplatform +google-cloud-storage +google-cloud-build +gcloud \ No newline at end of file diff --git a/.cloud-build/test_folders.txt b/.cloud-build/test_folders.txt new file mode 100644 index 00000000000..6eb63c85fba --- /dev/null +++ b/.cloud-build/test_folders.txt @@ -0,0 +1 @@ +notebooks/official \ No newline at end of file diff --git a/.cloud-build/utils/NotebookProcessors.py b/.cloud-build/utils/NotebookProcessors.py new file mode 100644 index 00000000000..d445750e003 --- /dev/null +++ b/.cloud-build/utils/NotebookProcessors.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nbconvert.preprocessors import Preprocessor +from typing import Dict +from . 
import UpdateNotebookVariables as update_notebook_variables


class RemoveNoExecuteCells(Preprocessor):
    """nbconvert preprocessor that drops cells tagged ``no_execute``.

    Cells whose metadata ``tags`` list contains ``"no_execute"`` are removed
    from the notebook before execution; all other cells pass through
    unchanged.
    """

    def preprocess(self, notebook, resources=None):
        executable_cells = []
        for cell in notebook.cells:
            if cell.metadata.get("tags"):
                if "no_execute" in cell.metadata.get("tags"):
                    continue
            executable_cells.append(cell)
        notebook.cells = executable_cells
        return notebook, resources


class UpdateVariablesPreprocessor(Preprocessor):
    """nbconvert preprocessor that substitutes placeholder variable values.

    For every code cell, placeholder assignments of the form
    ``VARIABLE_NAME = "[description]"`` are rewritten with the concrete
    values supplied in ``replacement_map``.
    """

    def __init__(self, replacement_map: Dict):
        # Fix: Preprocessor is a traitlets Configurable; without calling
        # super().__init__() its traits (e.g. `enabled`) are never
        # initialized, which breaks use through the nbconvert machinery.
        super().__init__()
        self._replacement_map = replacement_map

    @staticmethod
    def update_variables(content: str, replacement_map: Dict[str, str]) -> str:
        """Return ``content`` with each mapped placeholder variable replaced.

        Looks for the notebook placeholder format:
            VARIABLE_NAME = '[description]'
        """
        for variable_name, variable_value in replacement_map.items():
            content = update_notebook_variables.get_updated_value(
                content=content,
                variable_name=variable_name,
                variable_value=variable_value,
            )

        return content

    def preprocess(self, notebook, resources=None):
        executable_cells = []
        for cell in notebook.cells:
            if cell.cell_type == "code":
                cell.source = self.update_variables(
                    content=cell.source,
                    replacement_map=self._replacement_map,
                )

            executable_cells.append(cell)
        notebook.cells = executable_cells
        return notebook, resources
import re

"""
    This script is used to update variables in the notebook via regex
    It requires variables to be defined in particular format
For example, if your variable was PROJECT_ID, use:
    PROJECT_ID = "[your_project_here]"
Single-quotes also work:
    PROJECT_ID = '[your_project_here]'
Variables in conditionals can also be replaced:
    PROJECT_ID == "[your_project_here]"
"""


def get_updated_value(content: str, variable_name: str, variable_value: str) -> str:
    """Replace a quoted bracketed placeholder for ``variable_name`` with ``variable_value``.

    Matches both ``=`` assignments and ``==`` comparisons, with either single
    or double quotes around the ``[...]`` placeholder. Lines where the
    variable is not followed by a quoted ``[...]`` (e.g.
    ``PROJECT_ID = shell_output[0]``) are left untouched.
    """
    # Fix 1: the original character class [\",\'] also matched a literal
    # comma; the intent is "double OR single quote" only.
    # Fix 2: variable_name is escaped so regex metacharacters in a name
    # cannot corrupt the pattern.
    return re.sub(
        rf"({re.escape(variable_name)}.*?=.*?[\"'])\[.+?\]([\"'].*?)",
        rf"\1{variable_value}\2",
        content,
        flags=re.M,
    )


def test_update_value():
    new_content = get_updated_value(
        content='asdf\nPROJECT_ID = "[your-project-id]" #@param {type:"string"} \nasdf',
        variable_name="PROJECT_ID",
        variable_value="sample-project",
    )
    assert (
        new_content
        == 'asdf\nPROJECT_ID = "sample-project" #@param {type:"string"} \nasdf'
    )


def test_update_value_single_quotes():
    new_content = get_updated_value(
        content="PROJECT_ID = '[your-project-id]'",
        variable_name="PROJECT_ID",
        variable_value="sample-project",
    )
    assert new_content == "PROJECT_ID = 'sample-project'"


def test_update_value_avoidance():
    # No quoted placeholder after the variable: content must not change.
    new_content = get_updated_value(
        content="PROJECT_ID = shell_output[0] ",
        variable_name="PROJECT_ID",
        variable_value="sample-project",
    )
    assert new_content == "PROJECT_ID = shell_output[0] "


def test_region():
    new_content = get_updated_value(
        content='REGION = "[your-region]" # @param {type:"string"}',
        variable_name="REGION",
        variable_value="us-central1",
    )
    assert new_content == 'REGION = "us-central1" # @param {type:"string"}'
import os

import subprocess
import tarfile
import uuid


def download_file(bucket_name: str, blob_name: str, destination_file: str) -> str:
    """Copy a GCS object (bucket + blob name) to a local file.

    Returns the local destination path for convenience.
    """
    source_uri = f"gs://{bucket_name}/{blob_name}"
    subprocess.check_output(
        ["gsutil", "cp", source_uri, destination_file], encoding="UTF-8"
    )
    return destination_file


def upload_file(
    local_file_path: str,
    remote_file_path: str,
) -> str:
    """Copy a local file to a GCS path and return the remote path."""
    subprocess.check_output(
        ["gsutil", "cp", local_file_path, remote_file_path], encoding="UTF-8"
    )
    return remote_file_path


def archive_code_and_upload(staging_bucket: str):
    """Tar up every git-tracked file in the current directory and upload it to GCS."""
    # Unique name so concurrent builds never clobber each other's archives.
    archive_name = f"source_archived_{uuid.uuid4()}.tar.gz"

    tracked_files = subprocess.check_output(
        ["git", "ls-tree", "-r", "HEAD", "--name-only"], encoding="UTF-8"
    ).split("\n")

    with tarfile.open(archive_name, "w:gz") as archive:
        for tracked_file in tracked_files:
            # `git ls-tree` output ends with an empty entry; tracked files may
            # also be absent from the working tree (e.g. deleted locally).
            if tracked_file and os.path.exists(tracked_file):
                archive.add(tracked_file)

    # Upload the archive to the staging bucket.
    remote_archive_path = upload_file(
        local_file_path=archive_name,
        remote_file_path="/".join([staging_bucket, "code_archives", archive_name]),
    )

    print(f"Uploaded source code archive to {remote_archive_path}")

    return remote_archive_path
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +docker: + image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest + digest: sha256:ee038415ddc542590737f5e476fab963c6a1367223aeffcde41991a93ab37d49 +# created: 2022-04-21T15:43:16.246106921Z diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 00000000000..2997b1c1ce2 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,36 @@ +name: ci +on: pull_request + +jobs: + format_and_lint: + name: notebook format and lint + runs-on: ubuntu-latest + steps: + - name: Set up Python + uses: actions/setup-python@v3 + - name: Fetch pull request branch + uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Fetch base main branch + run: git fetch -u "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" main:main + - name: Install requirements + run: python3 -m pip install -U -r .github/workflows/linter/requirements.txt + - name: Format and lint notebooks + run: | + set +e + + .github/workflows/linter/run_linter.sh -t + RTN=$? + + if [ "$RTN" != "0" ]; then + echo "There were problems formatting/linting the notebooks." + echo "Please run the following commands locally from the root directory to attempt to autofix the issues:" + echo "" + echo "python3 -m pip install -U -r .github/workflows/linter/requirements.txt" + echo ".github/workflows/linter/run_linter.sh" + echo "" + echo "If it can't be autofixed, please fix them manually." + echo "Then, commit the fixes and push again." 
+ exit 1 + fi \ No newline at end of file diff --git a/.github/workflows/linter/requirements.txt b/.github/workflows/linter/requirements.txt new file mode 100644 index 00000000000..2d7591ce9b9 --- /dev/null +++ b/.github/workflows/linter/requirements.txt @@ -0,0 +1,9 @@ +git+https://github.com/tensorflow/docs +ipython +jupyter +nbconvert +black==22.3.0 +pyupgrade==2.32.0 +isort==5.10.1 +flake8==4.0.1 +nbqa==1.3.1 \ No newline at end of file diff --git a/.github/workflows/linter/run_linter.sh b/.github/workflows/linter/run_linter.sh new file mode 100644 index 00000000000..102c33a513a --- /dev/null +++ b/.github/workflows/linter/run_linter.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script automatically formats and lints all notebooks that have changed from the head of the main branch. +# +# Options: +# -t: Test-mode. Only test if format and linting are required but make no changes to files. +# +# Returns: +# This script will return 0 if linting was successful/unneeded and 1 if there were any errors. + +# `+e` enables the script to continue even when a command fails +set +e + +# `-o pipefail` sets the exit code to the rightmost comment to exit with a non-zero +set -o pipefail + +# Use RTN to return a non-zero value if the test fails. 
+RTN="0" + +is_test=false + +# Process all options supplied on the command line +while getopts 'tc' arg; do + case $arg in + 't') + is_test=true + ;; + *) + echo "Unimplemented flag" + exit 1 + ;; + esac +done + +echo "Test mode: $is_test" + +# Only check notebooks in test folders modified in this pull request. +# Note: Use process substitution to persist the data in the array +notebooks=() +while read -r file || [ -n "$line" ]; do + notebooks+=("$file") +done < <(git diff --name-only main... | grep '\.ipynb$') + +problematic_notebooks=() +if [ ${#notebooks[@]} -gt 0 ]; then + for notebook in "${notebooks[@]}"; do + if [ -f "$notebook" ]; then + echo "Checking notebook: ${notebook}" + + NBFMT_RTN="0" + BLACK_RTN="0" + PYUPGRADE_RTN="0" + ISORT_RTN="0" + FLAKE8_RTN="0" + + if [ "$is_test" = true ]; then + echo "Running nbfmt..." + python3 -m tensorflow_docs.tools.nbfmt --remove_outputs --test "$notebook" + NBFMT_RTN=$? + # echo "Running black..." + # python3 -m nbqa black "$notebook" --check + # BLACK_RTN=$? + echo "Running pyupgrade..." + python3 -m nbqa pyupgrade "$notebook" + PYUPGRADE_RTN=$? + echo "Running isort..." + python3 -m nbqa isort "$notebook" --check + ISORT_RTN=$? + echo "Running flake8..." + python3 -m nbqa flake8 "$notebook" --show-source --extend-ignore=W391,E501,F821,E402,F404,W503,E203,E722,W293,W291 + FLAKE8_RTN=$? + else + echo "Running black..." + python3 -m nbqa black "$notebook" + BLACK_RTN=$? + echo "Running pyupgrade..." + python3 -m nbqa pyupgrade "$notebook" + PYUPGRADE_RTN=$? + echo "Running isort..." + python3 -m nbqa isort "$notebook" + ISORT_RTN=$? + echo "Running nbfmt..." + python3 -m tensorflow_docs.tools.nbfmt --remove_outputs "$notebook" + NBFMT_RTN=$? + echo "Running flake8..." + python3 -m nbqa flake8 "$notebook" --show-source --extend-ignore=W391,E501,F821,E402,F404,W503,E203,E722,W293,W291 + FLAKE8_RTN=$? 
+ fi + + NOTEBOOK_RTN="0" + + if [ "$NBFMT_RTN" != "0" ]; then + NOTEBOOK_RTN="$NBFMT_RTN" + printf "nbfmt: Failed\n" + fi + + if [ "$BLACK_RTN" != "0" ]; then + NOTEBOOK_RTN="$BLACK_RTN" + printf "black: Failed\n" + fi + + if [ "$PYUPGRADE_RTN" != "0" ]; then + NOTEBOOK_RTN="$PYUPGRADE_RTN" + printf "pyupgrade: Failed\n" + fi + + if [ "$ISORT_RTN" != "0" ]; then + NOTEBOOK_RTN="$ISORT_RTN" + printf "isort: Failed\n" + fi + + if [ "$FLAKE8_RTN" != "0" ]; then + NOTEBOOK_RTN="$FLAKE8_RTN" + printf "flake8: Failed\n" + fi + + echo "Notebook lint finished with return code = $NOTEBOOK_RTN" + echo "" + if [ "$NOTEBOOK_RTN" != "0" ]; then + problematic_notebooks+=("$notebook") + RTN=$NOTEBOOK_RTN + fi + fi + done +else + echo "No notebooks modified in this pull request." +fi + +echo "All tests finished. Exiting with return code = $RTN" + +if [ ${#problematic_notebooks[@]} -gt 0 ]; then + echo "The following notebooks could not be automatically linted:" + printf '%s\n' "${problematic_notebooks[@]}" +fi + +exit "$RTN" \ No newline at end of file diff --git a/.repo-metadata.json b/.repo-metadata.json new file mode 100644 index 00000000000..54d282a7a69 --- /dev/null +++ b/.repo-metadata.json @@ -0,0 +1,5 @@ +{ + "language": "python", + "library_type": "OTHER", + "repo": "GoogleCloudPlatform/python-docs-samples" +} \ No newline at end of file diff --git a/owlbot.py b/owlbot.py new file mode 100644 index 00000000000..9dd60cf9b55 --- /dev/null +++ b/owlbot.py @@ -0,0 +1,17 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from synthtool.languages import python + +python.python_notebooks_testing_pipeline() \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000000..c6a0411d4df --- /dev/null +++ b/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "python-docs-samples", + "lockfileVersion": 2, + "requires": true, + "packages": {} +}