From d87583ba6845c598f14f2af0791422d4c231e813 Mon Sep 17 00:00:00 2001
From: per1234
Date: Fri, 1 Apr 2022 04:58:31 -0700
Subject: [PATCH] [skip changelog] Remove broken download stats workflow

The "arduino-stats" GitHub Actions workflow was designed to periodically
gather download statistics from the Arduino CDN and push the results to
Datadog.

The recorded stats showed a periodic decrease in the total download
count. Since that is impossible (a cumulative download count can only
grow), something was clearly wrong with the system and the recorded data
was not usable. The problem was never investigated.

On 2022-03-14, the runs of the "arduino-stats" workflow began to fail.
Because there was no relevant change in the repository between the last
successful run and the first failing run, the breakage was likely caused
by some external change.

Since the workflow never produced usable data, and the absence of any
investigation into that indicates the stats are not of immediate
importance, the best course of action is to remove the broken
infrastructure from the repository rather than invest time in fixing
something that isn't being used anyway.
---
 .github/tools/fetch_athena_stats.py  | 135 ---------------------------
 .github/workflows/arduino-stats.yaml |  57 -----------
 2 files changed, 192 deletions(-)
 delete mode 100644 .github/tools/fetch_athena_stats.py
 delete mode 100644 .github/workflows/arduino-stats.yaml

diff --git a/.github/tools/fetch_athena_stats.py b/.github/tools/fetch_athena_stats.py
deleted file mode 100644
index 047585538ba..00000000000
--- a/.github/tools/fetch_athena_stats.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import boto3
-import semver
-import os
-import logging
-import uuid
-import time
-
-
-# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-log = logging.getLogger()
-logging.getLogger("boto3").setLevel(logging.CRITICAL)
-logging.getLogger("botocore").setLevel(logging.CRITICAL)
-logging.getLogger("urllib3").setLevel(logging.CRITICAL)
-
-
-def execute(client, statement, dest_s3_output_location):
-    log.info("execute query: {} dumping in {}".format(statement, dest_s3_output_location))
-    result = client.start_query_execution(
-        QueryString=statement,
-        ClientRequestToken=str(uuid.uuid4()),
-        ResultConfiguration={
-            "OutputLocation": dest_s3_output_location,
-        },
-    )
-    execution_id = result["QueryExecutionId"]
-    log.info("wait for query {} completion".format(execution_id))
-    wait_for_query_execution_completion(client, execution_id)
-    log.info("operation successful")
-    return execution_id
-
-
-def wait_for_query_execution_completion(client, query_execution_id):
-    query_ended = False
-    while not query_ended:
-        query_execution = client.get_query_execution(QueryExecutionId=query_execution_id)
-        state = query_execution["QueryExecution"]["Status"]["State"]
-        if state == "SUCCEEDED":
-            query_ended = True
-        elif state in ["FAILED", "CANCELLED"]:
-            raise BaseException(
-                "query failed or canceled: {}".format(query_execution["QueryExecution"]["Status"]["StateChangeReason"])
-            )
-        else:
-            time.sleep(1)
-
-
-def valid(key):
-    split = key.split("_")
-    if len(split) < 1:
-        return False
-    try:
-        semver.parse(split[0])
-    except ValueError:
-        return False
-    return True
-
-
-def get_results(client, execution_id):
-    results_paginator = client.get_paginator("get_query_results")
-    results_iter = results_paginator.paginate(QueryExecutionId=execution_id, PaginationConfig={"PageSize": 1000})
-    res = {}
-    for results_page in results_iter:
-        for row in results_page["ResultSet"]["Rows"][1:]:
-            # Loop through the JSON objects
-            key = row["Data"][0]["VarCharValue"]
-            if valid(key):
-                res[key] = row["Data"][1]["VarCharValue"]
-
-    return res
-
-
-def convert_data(data):
-    result = []
-    for key, value in data.items():
-        # 0.18.0_macOS_64bit.tar.gz
-        split_key = key.split("_")
-        if len(split_key) != 3:
-            continue
-        (version, os_version, arch) = split_key
-        arch_split = arch.split(".")
-        if len(arch_split) < 1:
-            continue
-        arch = arch_split[0]
-        if len(arch) > 10:
-            # This can't be an architecture really.
-            # It's an ugly solution but works for now so deal with it.
-            continue
-        repo = os.environ["GITHUB_REPOSITORY"].split("/")[1]
-        result.append(
-            {
-                "type": "gauge",
-                "name": "arduino.downloads.total",
-                "value": value,
-                "host": os.environ["GITHUB_REPOSITORY"],
-                "tags": [
-                    f"version:{version}",
-                    f"os:{os_version}",
-                    f"arch:{arch}",
-                    "cdn:downloads.arduino.cc",
-                    f"project:{repo}",
-                ],
-            }
-        )
-
-    return result
-
-
-if __name__ == "__main__":
-    DEST_S3_OUTPUT = os.environ["AWS_ATHENA_OUTPUT_LOCATION"]
-    AWS_ATHENA_SOURCE_TABLE = os.environ["AWS_ATHENA_SOURCE_TABLE"]
-
-    session = boto3.session.Session(region_name="us-east-1")
-    athena_client = session.client("athena")
-
-    # Load all partitions before querying downloads
-    execute(athena_client, f"MSCK REPAIR TABLE {AWS_ATHENA_SOURCE_TABLE};", DEST_S3_OUTPUT)
-
-    query = f"""SELECT replace(json_extract_scalar(url_decode(url_decode(querystring)),
-'$.data.url'), 'https://downloads.arduino.cc/arduino-cli/arduino-cli_', '')
-AS flavor, count(json_extract(url_decode(url_decode(querystring)),'$')) AS gauge
-FROM {AWS_ATHENA_SOURCE_TABLE}
-WHERE json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
-LIKE 'https://downloads.arduino.cc/arduino-cli/arduino-cli_%'
-AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
-NOT LIKE '%latest%' -- exclude latest redirect
-AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
-NOT LIKE '%alpha%' -- exclude early alpha releases
-AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url')
-NOT LIKE '%.tar.bz2%' -- exclude very old releases archive formats
-group by 1 ;"""
-    exec_id = execute(athena_client, query, DEST_S3_OUTPUT)
-    results = get_results(athena_client, exec_id)
-    result_json = convert_data(results)
-
-    print(f"::set-output name=result::{result_json}")
diff --git a/.github/workflows/arduino-stats.yaml b/.github/workflows/arduino-stats.yaml
deleted file mode 100644
index 9b336fee11b..00000000000
--- a/.github/workflows/arduino-stats.yaml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: arduino-stats
-
-on:
-  schedule:
-    # run every day at 07:00 AM, 03:00 PM and 11:00 PM
-    - cron: "0 7,15,23 * * *"
-  workflow_dispatch:
-  repository_dispatch:
-
-jobs:
-  push-stats:
-    # This workflow is only of value to the arduino/arduino-cli repository and
-    # would always fail in forks
-    if: github.repository == 'arduino/arduino-cli'
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v3
-        with:
-          python-version: "3.x"
-
-      - name: Fetch downloads count form Arduino CDN using AWS Athena
-        id: fetch
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.STATS_AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.STATS_AWS_SECRET_ACCESS_KEY }}
-          AWS_ATHENA_SOURCE_TABLE: ${{ secrets.STATS_AWS_ATHENA_SOURCE_TABLE }}
-          AWS_ATHENA_OUTPUT_LOCATION: ${{ secrets.STATS_AWS_ATHENA_OUTPUT_LOCATION }}
-          GITHUB_REPOSITORY: ${{ github.repository }}
-        run: |
-          pip install boto3 semver
-          python .github/tools/fetch_athena_stats.py
-
-      - name: Send metrics
-        uses: masci/datadog@v1
-        with:
-          api-key: ${{ secrets.DD_API_KEY }}
-          # Metrics input expects YAML but JSON will work just right.
-          metrics: ${{steps.fetch.outputs.result}}
-
-      - name: Report failure
-        if: failure()
-        uses: masci/datadog@v1
-        with:
-          api-key: ${{ secrets.DD_API_KEY }}
-          events: |
-            - title: "Arduino CLI stats failing"
-              text: "Stats collection failed"
-              alert_type: "error"
-              host: ${{ github.repository }}
-              tags:
-                - "project:arduino-cli"
-                - "cdn:downloads.arduino.cc"
-                - "workflow:${{ github.workflow }}"