From 162bf2fe77115bfdbe3e2a232be1445d8dd66ebc Mon Sep 17 00:00:00 2001 From: per1234 Date: Sun, 30 Oct 2022 16:31:23 -0700 Subject: [PATCH 1/2] Remove broken download stats workflow The "arduino-stats" GitHub Actions workflow was designed to periodically gather download statistics from the Arduino CDN and push the results to Datadog. The recorded stats from the identical system in the Arduino CLI repository showed a periodic decrease in total download count. Since this is patently impossible, it is clear that something is wrong with the system and that the recorded data is not trustworthy. An investigation into the problem was never done. On 2022-03-14, the runs of the "arduino-stats" GitHub Actions workflow began to fail. Because there had not been any relevant change in the repository between the last successful run and the first failing run, it seems that some external change caused the breakage. The workflow also uses deprecated Node.js 12 runtime-based actions and the deprecated set-output workflow command, which currently result in warnings printed to the workflow run summary page, but will eventually cause the complete breakage of the workflow. Since the workflow never worked correctly, and the lack of any investigation into that indicates that the stats are not of immediate importance, the best course of action is to simply remove the broken infrastructure from the repository rather than investing time into fixing something that isn't being used anyway. 
--- .github/tools/fetch_athena_stats.py | 131 ---------------------------- .github/workflows/arduino-stats.yml | 57 ------------ 2 files changed, 188 deletions(-) delete mode 100644 .github/tools/fetch_athena_stats.py delete mode 100644 .github/workflows/arduino-stats.yml diff --git a/.github/tools/fetch_athena_stats.py b/.github/tools/fetch_athena_stats.py deleted file mode 100644 index 1f3140359..000000000 --- a/.github/tools/fetch_athena_stats.py +++ /dev/null @@ -1,131 +0,0 @@ -import boto3 -import semver -import os -import logging -import uuid -import time - - -# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -log = logging.getLogger() -logging.getLogger("boto3").setLevel(logging.CRITICAL) -logging.getLogger("botocore").setLevel(logging.CRITICAL) -logging.getLogger("urllib3").setLevel(logging.CRITICAL) - - -def execute(client, statement, dest_s3_output_location): - log.info("execute query: {} dumping in {}".format(statement, dest_s3_output_location)) - result = client.start_query_execution( - QueryString=statement, - ClientRequestToken=str(uuid.uuid4()), - ResultConfiguration={ - "OutputLocation": dest_s3_output_location, - }, - ) - execution_id = result["QueryExecutionId"] - log.info("wait for query {} completion".format(execution_id)) - wait_for_query_execution_completion(client, execution_id) - log.info("operation successful") - return execution_id - - -def wait_for_query_execution_completion(client, query_execution_id): - query_ended = False - while not query_ended: - query_execution = client.get_query_execution(QueryExecutionId=query_execution_id) - state = query_execution["QueryExecution"]["Status"]["State"] - if state == "SUCCEEDED": - query_ended = True - elif state in ["FAILED", "CANCELLED"]: - raise BaseException( - "query failed or canceled: {}".format(query_execution["QueryExecution"]["Status"]["StateChangeReason"]) - ) - else: - time.sleep(1) - - -def valid(key): - split = key.split("_") - if len(split) < 1: - return False - try: - 
semver.parse(split[0]) - except ValueError: - return False - return True - - -def get_results(client, execution_id): - results_paginator = client.get_paginator("get_query_results") - results_iter = results_paginator.paginate(QueryExecutionId=execution_id, PaginationConfig={"PageSize": 1000}) - res = {} - for results_page in results_iter: - for row in results_page["ResultSet"]["Rows"][1:]: - # Loop through the JSON objects - key = row["Data"][0]["VarCharValue"] - if valid(key): - res[key] = row["Data"][1]["VarCharValue"] - - return res - - -def convert_data(data): - result = [] - for key, value in data.items(): - # 0.18.0_macOS_64bit.tar.gz - split_key = key.split("_") - if len(split_key) != 3: - continue - (version, os_version, arch) = split_key - arch_split = arch.split(".") - if len(arch_split) < 1: - continue - arch = arch_split[0] - if len(arch) > 10: - # This can't be an architecture really. - # It's an ugly solution but works for now so deal with it. - continue - repo = os.environ["GITHUB_REPOSITORY"].split("/")[1] - result.append( - { - "type": "gauge", - "name": "arduino.downloads.total", - "value": value, - "host": os.environ["GITHUB_REPOSITORY"], - "tags": [ - f"version:{version}", - f"os:{os_version}", - f"arch:{arch}", - "cdn:downloads.arduino.cc", - f"project:{repo}", - ], - } - ) - - return result - - -if __name__ == "__main__": - DEST_S3_OUTPUT = os.environ["AWS_ATHENA_OUTPUT_LOCATION"] - AWS_ATHENA_SOURCE_TABLE = os.environ["AWS_ATHENA_SOURCE_TABLE"] - - session = boto3.session.Session(region_name="us-east-1") - athena_client = session.client("athena") - - # Load all partitions before querying downloads - execute(athena_client, f"MSCK REPAIR TABLE {AWS_ATHENA_SOURCE_TABLE};", DEST_S3_OUTPUT) - - query = f"""SELECT replace(json_extract_scalar(url_decode(url_decode(querystring)), -'$.data.url'), 'https://downloads.arduino.cc/arduino-ide/arduino-ide_', '') -AS flavor, count(json_extract(url_decode(url_decode(querystring)),'$')) AS gauge -FROM 
{AWS_ATHENA_SOURCE_TABLE} -WHERE json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') -LIKE 'https://downloads.arduino.cc/arduino-ide/arduino-ide_%' -AND json_extract_scalar(url_decode(url_decode(querystring)),'$.data.url') -NOT LIKE '%latest%' -- exclude latest redirect -group by 1 ;""" - exec_id = execute(athena_client, query, DEST_S3_OUTPUT) - results = get_results(athena_client, exec_id) - result_json = convert_data(results) - - print(f"::set-output name=result::{result_json}") diff --git a/.github/workflows/arduino-stats.yml b/.github/workflows/arduino-stats.yml deleted file mode 100644 index 167ab57a2..000000000 --- a/.github/workflows/arduino-stats.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: arduino-stats - -on: - schedule: - # run every day at 07:00 AM, 03:00 PM and 11:00 PM - - cron: "0 7,15,23 * * *" - workflow_dispatch: - repository_dispatch: - -jobs: - push-stats: - # This workflow is only of value to the arduino/arduino-ide repository and - # would always fail in forks - if: github.repository == 'arduino/arduino-ide' - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v2 - - - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: Fetch downloads count form Arduino CDN using AWS Athena - id: fetch - env: - AWS_ACCESS_KEY_ID: ${{ secrets.STATS_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.STATS_AWS_SECRET_ACCESS_KEY }} - AWS_ATHENA_SOURCE_TABLE: ${{ secrets.STATS_AWS_ATHENA_SOURCE_TABLE }} - AWS_ATHENA_OUTPUT_LOCATION: ${{ secrets.STATS_AWS_ATHENA_OUTPUT_LOCATION }} - GITHUB_REPOSITORY: ${{ github.repository }} - run: | - pip install boto3 semver - python .github/tools/fetch_athena_stats.py - - - name: Send metrics - uses: masci/datadog@v1 - with: - api-key: ${{ secrets.DD_API_KEY }} - # Metrics input expects YAML but JSON will work just right. 
- metrics: ${{steps.fetch.outputs.result}} - - - name: Report failure - if: failure() - uses: masci/datadog@v1 - with: - api-key: ${{ secrets.DD_API_KEY }} - events: | - - title: "Arduino IDE stats failing" - text: "Stats collection failed" - alert_type: "error" - host: ${{ github.repository }} - tags: - - "project:arduino-ide" - - "cdn:downloads.arduino.cc" - - "workflow:${{ github.workflow }}" From 378b1819057d199ff5933ec30257d15ee581b652 Mon Sep 17 00:00:00 2001 From: per1234 Date: Sun, 30 Oct 2022 16:39:33 -0700 Subject: [PATCH 2/2] Remove unused GitHub release download stats workflow The "github-stats" GitHub Actions workflow periodically gathers GitHub release asset download statistics for Arduino IDE and pushes the results to Datadog. There are no known problems with this workflow. However, the companion "arduino-stats" workflow that did the same for the downloads of Arduino IDE from downloads.arduino.cc was broken and thus removed from the repository. The GitHub stats are not very valuable on their own as they only provide an unknown fraction of the total downloads of Arduino IDE. They have also never ended up being used. The workflow also uses the deprecated Node.js 12 runtime, which currently results in warnings printed to the workflow run summary page, but will eventually cause the complete breakage of the workflow. Since it doesn't provide any value and represents a maintenance burden, the workflow is hereby removed from the repository. 
--- .github/workflows/github-stats.yml | 96 ------------------------------ 1 file changed, 96 deletions(-) delete mode 100644 .github/workflows/github-stats.yml diff --git a/.github/workflows/github-stats.yml b/.github/workflows/github-stats.yml deleted file mode 100644 index c7c8a98af..000000000 --- a/.github/workflows/github-stats.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: github-stats - -on: - schedule: - # run every 30 minutes - - cron: "*/30 * * * *" - workflow_dispatch: - repository_dispatch: - -jobs: - push-stats: - # This workflow is only of value to the arduino/arduino-ide repository and - # would always fail in forks - if: github.repository == 'arduino/arduino-ide' - runs-on: ubuntu-latest - - steps: - - name: Fetch downloads count - id: fetch - uses: actions/github-script@v4 - with: - github-token: ${{github.token}} - script: | - let metrics = [] - - // Get a list of releases - const opts = github.repos.listReleases.endpoint.merge({ - ...context.repo - }) - const releases = await github.paginate(opts) - - // Get download stats for every release - for (const rel of releases) { - // Names for assets are like `arduino-ide_2.0.0-beta.12_Linux_64bit.zip`, - // we'll use this later to split the asset file name more easily - const baseName = `arduino-ide_${rel.name}_` - - // Get a list of assets for this release - const opts = github.repos.listReleaseAssets.endpoint.merge({ - ...context.repo, - release_id: rel.id - }) - const assets = await github.paginate(opts) - - for (const asset of assets) { - // Ignore files that are not arduino-ide packages - if (!asset.name.startsWith(baseName)) { - continue - } - - // Strip the base and remove file extension to get `Linux_32bit` - systemArch = asset.name.replace(baseName, "").split(".")[0].split("_") - - // Add a metric object to the list of gathered metrics - metrics.push({ - "type": "gauge", - "name": "arduino.downloads.total", - "value": asset.download_count, - "host": "${{ github.repository }}", - "tags": [ - 
`version:${rel.name}`, - `os:${systemArch[0]}`, - `arch:${systemArch[1]}`, - "cdn:github.com", - "project:arduino-ide" - ] - }) - } - } - - // The action will put whatever we return from this function in - // `outputs.result`, JSON encoded. So we just return the array - // of objects and GitHub will do the rest. - return metrics - - - name: Send metrics - uses: masci/datadog@v1 - with: - api-key: ${{ secrets.DD_API_KEY }} - # Metrics input expects YAML but JSON will work just right. - metrics: ${{steps.fetch.outputs.result}} - - - name: Report failure - if: failure() - uses: masci/datadog@v1 - with: - api-key: ${{ secrets.DD_API_KEY }} - events: | - - title: "Arduino IDE stats failing" - text: "Stats collection failed" - alert_type: "error" - host: ${{ github.repository }} - tags: - - "project:arduino-ide" - - "cdn:github.com" - - "workflow:${{ github.workflow }}"