Skip to content

Add PR action to validate notebook format #1793

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .github/scripts/check_notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import subprocess
import sys
from pathlib import Path

import nbformat


def get_changed_notebooks(base_ref: str = "origin/main") -> list[Path]:
    """
    Return the notebook paths that differ between the working tree and a
    git reference.

    Runs ``git diff --name-only -z <base_ref> -- *.ipynb`` in the current
    working directory. Deleted notebooks are part of the result; callers
    that need the file contents must check that the path still exists.

    Args:
        base_ref: Git reference to compare against.

    Returns:
        The paths exactly as reported by git, in git's output order.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    result = subprocess.run(
        # -z makes git emit NUL-terminated, unquoted paths. Without it, names
        # containing non-ASCII bytes, quotes, backslashes or control
        # characters are wrapped in double quotes and escaped, which yields a
        # path that does not exist on disk and would be silently skipped.
        ["git", "diff", "--name-only", "-z", base_ref, "--", "*.ipynb"],
        capture_output=True,
        # File names are bytes on disk; decode as UTF-8 and keep any
        # undecodable bytes round-trippable instead of failing.
        encoding="utf-8",
        errors="surrogateescape",
        check=True,
    )
    # Names are taken verbatim (no stripping): surrounding whitespace is a
    # legal part of a file name. The trailing NUL yields one empty entry.
    return [Path(name) for name in result.stdout.split("\0") if name]


def is_valid_notebook(path: Path) -> bool:
    """
    Check that the notebook at ``path`` can be read and conforms to the
    notebook schema.

    The file is parsed with nbformat (converted to format version 4) and then
    validated explicitly. Any failure is printed as
    ``<path>: INVALID - <error>``.

    Args:
        path: Location of the ``.ipynb`` file to check.

    Returns:
        True if the notebook was read and validated without error, False
        otherwise (including when the file cannot be opened).
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            notebook = nbformat.read(f, as_version=4)
        # nbformat.read() only logs schema violations instead of raising, so
        # a file that is valid JSON but not a valid notebook would pass.
        # Validate explicitly to turn schema errors into a failure.
        nbformat.validate(notebook)
    except Exception as e:
        # Deliberately broad: every way of failing to load or validate the
        # file should be reported as an invalid notebook, not crash the run.
        print(f"{path}: INVALID - {e}")
        return False
    return True


def main() -> None:
    """
    Validate every notebook changed on the current branch.

    Exits with status 0 when there is nothing to validate, exits with
    status 1 when at least one notebook is invalid, and returns normally
    when all changed notebooks are valid.
    """
    notebooks = get_changed_notebooks()
    if not notebooks:
        print("No changed .ipynb files to validate.")
        sys.exit(0)

    print(f"Validating {len(notebooks)} notebook(s)...")

    # Paths that no longer exist were deleted on this branch and are skipped;
    # the existence check runs before validation for each path, in order.
    invalid_count = sum(
        1
        for notebook_path in notebooks
        if notebook_path.exists() and not is_valid_notebook(notebook_path)
    )

    if not invalid_count:
        print("All changed notebooks are valid.")
        return

    print(f"{invalid_count} invalid notebook(s) found.")
    sys.exit(1)


# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
25 changes: 25 additions & 0 deletions .github/workflows/validate-notebooks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Runs on every pull request and fails the check when a changed notebook
# is reported as invalid by .github/scripts/check_notebooks.py.
name: Validate Changed Notebooks

on: [pull_request]

jobs:
  validate-notebooks:
    name: Validate Notebooks
    runs-on: ubuntu-latest

    steps:
      # v4 runs on a supported Node runtime; v3 relied on the deprecated Node 16.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0 # needed for git diff to work

      # v5 runs on a supported Node runtime; v4 relied on the deprecated Node 16.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: pip install nbformat

      - name: Validate changed .ipynb files
        run: python .github/scripts/check_notebooks.py
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,6 @@ examples/fine-tuned_qa/local_cache/*

# PyCharm files
.idea/

# VS Code files
.vscode/
72 changes: 34 additions & 38 deletions examples/vector_databases/chroma/hyde-with-chroma-and-openai.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
Expand Down Expand Up @@ -236,7 +232,7 @@
"def build_prompt(claim):\n",
" return [\n",
" {\"role\": \"system\", \"content\": \"I will ask you to assess a scientific claim. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n",
" {\"role\": \"user\", \"content\": f\"\"\" \n",
" {\"role\": \"user\", \"content\": f\"\"\"\n",
"Example:\n",
"\n",
"Claim:\n",
Expand Down Expand Up @@ -298,7 +294,7 @@
"# Let's take a look at 50 claims\n",
"samples = claim_df.sample(50)\n",
"\n",
"claims = samples['claim'].tolist() \n"
"claims = samples['claim'].tolist()\n"
]
},
{
Expand All @@ -317,7 +313,7 @@
"def get_groundtruth(evidence):\n",
" groundtruth = []\n",
" for e in evidence:\n",
" # Evidence is empty \n",
" # Evidence is empty\n",
" if len(e) == 0:\n",
" groundtruth.append('NEE')\n",
" else:\n",
Expand Down Expand Up @@ -392,17 +388,17 @@
"text": [
"\tGroundtruth\n",
"\tTrue\tFalse\tNEE\n",
"True\t12\t4\t16\t\n",
"False\t0\t4\t3\t\n",
"NEE\t6\t2\t3\t\n"
"True\t9\t3\t15\t\n",
"False\t0\t3\t2\t\n",
"NEE\t8\t6\t4\t\n"
]
},
{
"data": {
"text/plain": [
"{'True': {'True': 12, 'False': 4, 'NEE': 16},\n",
" 'False': {'True': 0, 'False': 4, 'NEE': 3},\n",
" 'NEE': {'True': 6, 'False': 2, 'NEE': 3}}"
"{'True': {'True': 9, 'False': 3, 'NEE': 15},\n",
" 'False': {'True': 0, 'False': 3, 'NEE': 2},\n",
" 'NEE': {'True': 8, 'False': 6, 'NEE': 4}}"
]
},
"execution_count": 10,
Expand Down Expand Up @@ -631,13 +627,13 @@
"outputs": [],
"source": [
"def build_prompt_with_context(claim, context):\n",
" return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"}, \n",
" return [{'role': 'system', 'content': \"I will ask you to assess whether a particular scientific claim, based on evidence provided. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence.\"},\n",
" {'role': 'user', 'content': f\"\"\"\"\n",
"The evidence is the following:\n",
"\n",
"{' '.join(context)}\n",
"\n",
"Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. Do not output any other text. \n",
"Assess the following claim on the basis of the evidence. Output only the text 'True' if the claim is true, 'False' if the claim is false, or 'NEE' if there's not enough evidence. Do not output any other text.\n",
"\n",
"Claim:\n",
"{claim}\n",
Expand Down Expand Up @@ -683,17 +679,17 @@
"text": [
"\tGroundtruth\n",
"\tTrue\tFalse\tNEE\n",
"True\t13\t0\t3\t\n",
"False\t0\t9\t3\t\n",
"NEE\t5\t1\t16\t\n"
"True\t13\t1\t4\t\n",
"False\t1\t10\t2\t\n",
"NEE\t3\t1\t15\t\n"
]
},
{
"data": {
"text/plain": [
"{'True': {'True': 13, 'False': 0, 'NEE': 3},\n",
" 'False': {'True': 0, 'False': 9, 'NEE': 3},\n",
" 'NEE': {'True': 5, 'False': 1, 'NEE': 16}}"
"{'True': {'True': 13, 'False': 1, 'NEE': 4},\n",
" 'False': {'True': 1, 'False': 10, 'NEE': 2},\n",
" 'NEE': {'True': 3, 'False': 1, 'NEE': 15}}"
]
},
"execution_count": 16,
Expand Down Expand Up @@ -774,17 +770,17 @@
"text": [
"\tGroundtruth\n",
"\tTrue\tFalse\tNEE\n",
"True\t6\t0\t3\t\n",
"False\t0\t3\t0\t\n",
"NEE\t12\t7\t19\t\n"
"True\t9\t0\t1\t\n",
"False\t0\t7\t0\t\n",
"NEE\t8\t5\t20\t\n"
]
},
{
"data": {
"text/plain": [
"{'True': {'True': 6, 'False': 0, 'NEE': 3},\n",
" 'False': {'True': 0, 'False': 3, 'NEE': 0},\n",
" 'NEE': {'True': 12, 'False': 7, 'NEE': 19}}"
"{'True': {'True': 9, 'False': 0, 'NEE': 1},\n",
" 'False': {'True': 0, 'False': 7, 'NEE': 0},\n",
" 'NEE': {'True': 8, 'False': 5, 'NEE': 20}}"
]
},
"execution_count": 19,
Expand Down Expand Up @@ -843,19 +839,19 @@
"source": [
"def build_hallucination_prompt(claim):\n",
" return [{'role': 'system', 'content': \"\"\"I will ask you to write an abstract for a scientific paper which supports or refutes a given claim. It should be written in scientific language, include a title. Output only one abstract, then stop.\n",
" \n",
"\n",
" An Example:\n",
"\n",
" Claim:\n",
" A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.\n",
"\n",
" Abstract:\n",
" BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria. \n",
" METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09). \n",
" BACKGROUND The heritable haemoglobinopathy alpha(+)-thalassaemia is caused by the reduced synthesis of alpha-globin chains that form part of normal adult haemoglobin (Hb). Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count. Alpha(+)-thalassaemia homozygosity confers considerable protection against severe malaria, including severe malarial anaemia (SMA) (Hb concentration < 50 g/l), but does not influence parasite count. We tested the hypothesis that the erythrocyte indices associated with alpha(+)-thalassaemia homozygosity provide a haematological benefit during acute malaria.\n",
" METHODS AND FINDINGS Data from children living on the north coast of Papua New Guinea who had participated in a case-control study of the protection afforded by alpha(+)-thalassaemia against severe malaria were reanalysed to assess the genotype-specific reduction in erythrocyte count and Hb levels associated with acute malarial disease. We observed a reduction in median erythrocyte count of approximately 1.5 x 10(12)/l in all children with acute falciparum malaria relative to values in community children (p < 0.001). We developed a simple mathematical model of the linear relationship between Hb concentration and erythrocyte count. This model predicted that children homozygous for alpha(+)-thalassaemia lose less Hb than children of normal genotype for a reduction in erythrocyte count of >1.1 x 10(12)/l as a result of the reduced mean cell Hb in homozygous alpha(+)-thalassaemia. In addition, children homozygous for alpha(+)-thalassaemia require a 10% greater reduction in erythrocyte count than children of normal genotype (p = 0.02) for Hb concentration to fall to 50 g/l, the cutoff for SMA. We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09).\n",
" CONCLUSIONS The increased erythrocyte count and microcytosis in children homozygous for alpha(+)-thalassaemia may contribute substantially to their protection against SMA. A lower concentration of Hb per erythrocyte and a larger population of erythrocytes may be a biologically advantageous strategy against the significant reduction in erythrocyte count that occurs during acute infection with the malaria parasite Plasmodium falciparum. This haematological profile may reduce the risk of anaemia by other Plasmodium species, as well as other causes of anaemia. Other host polymorphisms that induce an increased erythrocyte count and microcytosis may confer a similar advantage.\n",
"\n",
" End of example. \n",
" \n",
" End of example.\n",
"\n",
" \"\"\"}, {'role': 'user', 'content': f\"\"\"\"\n",
" Perform the task for the following claim.\n",
"\n",
Expand Down Expand Up @@ -931,17 +927,17 @@
"text": [
"\tGroundtruth\n",
"\tTrue\tFalse\tNEE\n",
"True\t11\t0\t5\t\n",
"False\t0\t8\t1\t\n",
"NEE\t7\t2\t16\t\n"
"True\t13\t0\t3\t\n",
"False\t1\t10\t1\t\n",
"NEE\t3\t2\t17\t\n"
]
},
{
"data": {
"text/plain": [
"{'True': {'True': 11, 'False': 0, 'NEE': 5},\n",
" 'False': {'True': 0, 'False': 8, 'NEE': 1},\n",
" 'NEE': {'True': 7, 'False': 2, 'NEE': 16}}"
"{'True': {'True': 13, 'False': 0, 'NEE': 3},\n",
" 'False': {'True': 1, 'False': 10, 'NEE': 1},\n",
" 'NEE': {'True': 3, 'False': 2, 'NEE': 17}}"
]
},
"execution_count": 23,
Expand Down
2 changes: 1 addition & 1 deletion registry.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@
date: 2025-04-23
authors:
- atroyn
- brandonbaker
- brandonbaker-openai
tags:
- embeddings
- completions
Expand Down