Add a script to split minimal tests into distinct directories depending on metrics differences (mozilla#623)

Luni-4 · web-flow · commit 2531d02788b6 · 2021-05-11T08:57:03.000+02:00
* Add a script to split minimal tests

This script splits minimal tests into distinct directories depending on
their metric differences

* Use the new script in the metric-checker
diff --git a/check-grammars-crates.sh b/check-grammars-crates.sh
@@ -92,29 +92,12 @@ if [ "$(ls -A $COMPARE)" ]; then
     # Maximum number of considered minimal tests for a metric
     MT_THRESHOLD=30
 
-    # Array containing the considered metrics
-    # TODO: Implement a command into rust-code-analysis-cli that returns all
-    # computed metrics https://github.com/mozilla/rust-code-analysis/issues/478
-    METRICS=("cognitive" "sloc" "ploc" "lloc" "cloc" "blank" "cyclomatic" "halstead" "nom" "nexits" "nargs")
-
-    # Output directory name
+    # Output directory path
     OUTPUT_DIR=/tmp/output-$TREE_SITTER_CRATE
 
-    # Create output directory
-    mkdir -p $OUTPUT_DIR
-
-    # Retrieve minimal tests for a metric
-    for METRIC in "${METRICS[@]}"
-    do
-
-        PREFIX_METRIC="\.$METRIC"
-        FILES=`grep -r -i -l $PREFIX_METRIC $COMPARE | head -$MT_THRESHOLD`
-        if [ -n "$FILES" ]
-        then
-            mkdir -p $OUTPUT_DIR/$METRIC
-            cp $FILES $OUTPUT_DIR/$METRIC
-        fi
-    done
+    # Split files into distinct directories depending on
+    # their metric differences
+    ./split-minimal-tests.py -i $COMPARE -o $OUTPUT_DIR -t $MT_THRESHOLD
 
     tar -czvf /tmp/json-diffs-and-minimal-tests.tar.gz $COMPARE $OUTPUT_DIR
 fi
diff --git a/split-minimal-tests.py b/split-minimal-tests.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+"""split-minimal-tests
+This script splits HTML minimal-tests, produced by a tool called
+`json-minimal-tests`, into distinct directories depending on metric differences.
+
+Usage:
+
+./split-minimal-tests.py -i INPUT_DIR -o OUTPUT_DIR [-t MT_THRESHOLD]
+
+NOTE: OUTPUT_DIR is the path to the output directory to be created.
+This directory will contain one subdirectory for each metric that presents
+differences, named after that metric, or be empty if no metric differences
+are found.
+MT_THRESHOLD determines the maximum number of considered minimal tests
+for a metric.
+"""
+
+import argparse
+import pathlib
+import re
+import shutil
+import typing as T
+
+# Metric names: searched as ".<name>" in each test and used as directory names
+# TODO: Implement a command in rust-code-analysis-cli that returns all
+# computed metrics https://github.com/mozilla/rust-code-analysis/issues/478
+METRICS = [
+    "cognitive",
+    "sloc",
+    "ploc",
+    "lloc",
+    "cloc",
+    "blank",
+    "cyclomatic",
+    "halstead",
+    "nom",
+    "nexits",
+    "nargs",
+]
+
+
+def main() -> None:
+    """Split HTML minimal tests into one output directory per metric.
+
+    Reads every ``*.html`` file in ``--input``, ignores everything inside
+    its ``<pre>`` tags and copies the file into ``OUTPUT/<metric>`` for each
+    name in ``METRICS`` found as ``.<metric>`` in the remaining text.
+    At most ``--threshold`` files are copied per metric, if it is given.
+    """
+    parser = argparse.ArgumentParser(
+        prog="split-minimal-tests",
+        description="This tool splits HTML minimal-tests, produced by "
+        "a software called `json-minimal-tests`, into distinct directories "
+        "depending on metric differences.",
+        epilog="The source code of this program can be found on "
+        "GitHub at https://github.com/mozilla/rust-code-analysis",
+    )
+
+    # Arguments
+    parser.add_argument(
+        "--input",
+        "-i",
+        type=pathlib.Path,
+        required=True,
+        help="Input directory containing HTML minimal tests.",
+    )
+
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=pathlib.Path,
+        required=True,
+        help="Path to the output directory.",
+    )
+
+    # Optional arguments
+    parser.add_argument(
+        "--threshold",
+        "-t",
+        type=int,
+        help="Maximum number of considered minimal tests for a metric.",
+    )
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # A negative threshold would silently drop files from the end of a list
+    if args.threshold is not None and args.threshold < 0:
+        parser.error("--threshold must be a non-negative integer")
+
+    # Create output directory
+    args.output.mkdir(parents=True, exist_ok=True)
+
+    # Compile the patterns once instead of once per file
+    pre_re = re.compile(r"<pre>.*?</pre>", re.DOTALL)
+    metric_res = {name: re.compile(r"\." + re.escape(name)) for name in METRICS}
+
+    # Save files associated to each metric
+    metrics_saver: T.Dict[str, T.List[pathlib.Path]] = {name: [] for name in METRICS}
+
+    # Sort the files so that the ones kept by the threshold are reproducible
+    for path in sorted(args.input.glob("*.html")):
+        # Undecodable bytes are replaced: only ASCII metric names are searched
+        file_str = path.read_text(encoding="utf-8", errors="replace")
+
+        # Remove all code inside <pre></pre> tags
+        file_no_pre = pre_re.sub("", file_str)
+
+        # Save path for each metric that presents a difference in the file
+        for name, metric_re in metric_res.items():
+            if metric_re.search(file_no_pre):
+                metrics_saver[name].append(path)
+
+    # Copy the saved files into their metric directories
+    for name, metric_files in metrics_saver.items():
+        # A threshold of `None` means no limit: the slice keeps the whole list
+        output_paths = metric_files[: args.threshold]
+        if not output_paths:
+            continue
+
+        # Create metric directory
+        metric_path = args.output / name
+        metric_path.mkdir(parents=True, exist_ok=True)
+        for path in output_paths:
+            shutil.copy(path, metric_path)
+
+
+if __name__ == "__main__":  # Entry point when the file is executed as a script
+    main()