"""Validate the Jupyter notebooks that differ from a base git reference.

Intended to be run from the repository root (for example by a CI job): the
paths reported by ``git diff --name-only`` are relative to the top of the
working tree, and they are checked for existence relative to the current
working directory.
"""
import subprocess
import sys
from pathlib import Path


def parse_changed_paths(git_output: str) -> list[Path]:
    """
    Converts NUL-separated ``git diff --name-only -z`` output into paths.

    Names are taken verbatim (no stripping), so file names containing
    spaces, leading/trailing whitespace or non-ASCII characters survive
    unchanged. Empty fields, such as the one after the final NUL
    terminator, are ignored.
    """
    return [Path(name) for name in git_output.split("\0") if name]


def get_changed_notebooks(base_ref: str = "origin/main") -> list[Path]:
    """
    Returns a list of changed notebook paths in the current working tree
    compared to the specified base reference.

    Deleted notebooks are included in the result; callers that need the
    file contents must check that the path still exists.

    Raises:
        subprocess.CalledProcessError: if the ``git diff`` command fails
            (for example when ``base_ref`` is unknown).
    """
    result = subprocess.run(
        # -z: NUL-terminated, unquoted names. Without it git C-quotes paths
        # containing "unusual" bytes (e.g. non-ASCII), which would yield
        # Path objects that do not exist on disk.
        ["git", "diff", "--name-only", "-z", base_ref, "--", "*.ipynb"],
        capture_output=True,
        text=True,
        encoding="utf-8",
        check=True,
    )
    return parse_changed_paths(result.stdout)


def is_valid_notebook(path: Path) -> bool:
    """
    Checks if the notebook at the given path is valid by reading it with
    nbformat (as version 4) and validating it against the notebook schema.

    Prints ``<path>: INVALID - <reason>`` and returns False on any failure
    to open, parse or validate the file; returns True otherwise.
    """
    # Imported here so the third-party dependency is only required when a
    # notebook is actually validated.
    import nbformat

    try:
        with path.open("r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        # nbformat.read() only *logs* schema violations and still returns
        # the notebook; validate() raises, so the failure is reported here.
        nbformat.validate(notebook)
    except Exception as e:
        # Deliberately broad: this function is a yes/no validity check, so
        # every failure mode is reported as an invalid notebook.
        print(f"{path}: INVALID - {e}")
        return False
    return True


def main() -> None:
    """
    Main function to validate the format of changed notebooks.

    Exits with status 1 if at least one changed notebook is invalid;
    returns normally otherwise.
    """
    # Deleted notebooks still show up in the diff but have nothing to check.
    notebooks = [path for path in get_changed_notebooks() if path.exists()]
    if not notebooks:
        print("No changed .ipynb files to validate.")
        return

    print(f"Validating {len(notebooks)} notebook(s)...")
    # Check every notebook (no short-circuit) so all failures are reported
    # in a single run.
    errors = sum(not is_valid_notebook(path) for path in notebooks)

    if errors:
        print(f"{errors} invalid notebook(s) found.")
        sys.exit(1)
    print("All changed notebooks are valid.")


if __name__ == "__main__":
    main()
pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n", - "\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -236,7 +232,7 @@ "def build_prompt(claim):\n", " return [\n", " {\"role\": \"system\", \"content\": \"I will ask you to assess a scientific claim. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n", - " {\"role\": \"user\", \"content\": f\"\"\" \n", + " {\"role\": \"user\", \"content\": f\"\"\"\n", "Example:\n", "\n", "Claim:\n", @@ -298,7 +294,7 @@ "# Let's take a look at 50 claims\n", "samples = claim_df.sample(50)\n", "\n", - "claims = samples['claim'].tolist() \n" + "claims = samples['claim'].tolist()\n" ] }, { @@ -317,7 +313,7 @@ "def get_groundtruth(evidence):\n", " groundtruth = []\n", " for e in evidence:\n", - " # Evidence is empty \n", + " # Evidence is empty\n", " if len(e) == 0:\n", " groundtruth.append('NEE')\n", " else:\n", @@ -392,17 +388,17 @@ "text": [ "\tGroundtruth\n", "\tTrue\tFalse\tNEE\n", - "True\t12\t4\t16\t\n", - "False\t0\t4\t3\t\n", - "NEE\t6\t2\t3\t\n" + "True\t9\t3\t15\t\n", + "False\t0\t3\t2\t\n", + "NEE\t8\t6\t4\t\n" ] }, { "data": { "text/plain": [ - "{'True': {'True': 12, 'False': 4, 'NEE': 16},\n", - " 'False': {'True': 0, 'False': 4, 'NEE': 3},\n", - " 'NEE': {'True': 6, 'False': 2, 
'NEE': 3}}" + "{'True': {'True': 9, 'False': 3, 'NEE': 15},\n", + " 'False': {'True': 0, 'False': 3, 'NEE': 2},\n", + " 'NEE': {'True': 8, 'False': 6, 'NEE': 4}}" ] }, "execution_count": 10, @@ -631,13 +627,13 @@ "outputs": [], "source": [ "def build_prompt_with_context(claim, context):\n", - " return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"}, \n", + " return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n", " {'role': 'user', 'content': f\"\"\"\"\n", "The evidence is the following:\n", "\n", "{' '.join(context)}\n", "\n", - "Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. Do not output any other text. \n", + "Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. 
Do not output any other text.\n", "\n", "Claim:\n", "{claim}\n", @@ -683,17 +679,17 @@ "text": [ "\tGroundtruth\n", "\tTrue\tFalse\tNEE\n", - "True\t13\t0\t3\t\n", - "False\t0\t9\t3\t\n", - "NEE\t5\t1\t16\t\n" + "True\t13\t1\t4\t\n", + "False\t1\t10\t2\t\n", + "NEE\t3\t1\t15\t\n" ] }, { "data": { "text/plain": [ - "{'True': {'True': 13, 'False': 0, 'NEE': 3},\n", - " 'False': {'True': 0, 'False': 9, 'NEE': 3},\n", - " 'NEE': {'True': 5, 'False': 1, 'NEE': 16}}" + "{'True': {'True': 13, 'False': 1, 'NEE': 4},\n", + " 'False': {'True': 1, 'False': 10, 'NEE': 2},\n", + " 'NEE': {'True': 3, 'False': 1, 'NEE': 15}}" ] }, "execution_count": 16, @@ -774,17 +770,17 @@ "text": [ "\tGroundtruth\n", "\tTrue\tFalse\tNEE\n", - "True\t6\t0\t3\t\n", - "False\t0\t3\t0\t\n", - "NEE\t12\t7\t19\t\n" + "True\t9\t0\t1\t\n", + "False\t0\t7\t0\t\n", + "NEE\t8\t5\t20\t\n" ] }, { "data": { "text/plain": [ - "{'True': {'True': 6, 'False': 0, 'NEE': 3},\n", - " 'False': {'True': 0, 'False': 3, 'NEE': 0},\n", - " 'NEE': {'True': 12, 'False': 7, 'NEE': 19}}" + "{'True': {'True': 9, 'False': 0, 'NEE': 1},\n", + " 'False': {'True': 0, 'False': 7, 'NEE': 0},\n", + " 'NEE': {'True': 8, 'False': 5, 'NEE': 20}}" ] }, "execution_count": 19, @@ -843,19 +839,19 @@ "source": [ "def build_hallucination_prompt(claim):\n", " return [{'role': 'system', 'content': \"\"\"I will ask you to write an abstract for a scientific paper which supports or refutes a given claim. It should be written in scientific language, include a title. Output only one abstract, then stop.\n", - " \n", + "\n", " An Example:\n", "\n", " Claim:\n", " A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.\n", "\n", " Abstract:\n", - " BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). 
Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria. \n", - " METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09). 
\n", + " BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria.\n", + " METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. 
We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09).\n", " CONCLUSIONS The increased erythrocyte count and microcytosis in children homozygous for alpha(+)-thalassaemia may contribute substantially to their protection against SMA. A lower concentration of Hb per erythrocyte and a larger population of erythrocytes may be a biologically advantageous strategy against the significant reduction in erythrocyte count that occurs during acute infection with the malaria parasite Plasmodium falciparum. This haematological profile may reduce the risk of anaemia by other Plasmodium species, as well as other causes of anaemia. Other host polymorphisms that induce an increased erythrocyte count and microcytosis may confer a similar advantage.\n", "\n", - " End of example. \n", - " \n", + " End of example.\n", + "\n", " \"\"\"}, {'role': 'user', 'content': f\"\"\"\"\n", " Perform the task for the following claim.\n", "\n", @@ -931,17 +927,17 @@ "text": [ "\tGroundtruth\n", "\tTrue\tFalse\tNEE\n", - "True\t11\t0\t5\t\n", - "False\t0\t8\t1\t\n", - "NEE\t7\t2\t16\t\n" + "True\t13\t0\t3\t\n", + "False\t1\t10\t1\t\n", + "NEE\t3\t2\t17\t\n" ] }, { "data": { "text/plain": [ - "{'True': {'True': 11, 'False': 0, 'NEE': 5},\n", - " 'False': {'True': 0, 'False': 8, 'NEE': 1},\n", - " 'NEE': {'True': 7, 'False': 2, 'NEE': 16}}" + "{'True': {'True': 13, 'False': 0, 'NEE': 3},\n", + " 'False': {'True': 1, 'False': 10, 'NEE': 1},\n", + " 'NEE': {'True': 3, 'False': 2, 'NEE': 17}}" ] }, "execution_count": 23, diff --git a/registry.yaml b/registry.yaml index 37fe4657ca..c10545c552 100644 --- a/registry.yaml +++ b/registry.yaml @@ -694,7 +694,7 @@ date: 2025-04-23 authors: - atroyn - - brandonbaker + - brandonbaker-openai tags: - embeddings - completions