Merge pull request #53 from grace-sng7/creating_augmented_suggester

grace-sng7 · web-flow · commit d31fc791281a · 2025-05-26T21:17:22.000-05:00
diff --git a/README.md b/README.md
@@ -49,11 +49,22 @@ domain_expertises = modeler.suggest_domain_expertises(all_factors)
 # Suggest a set of potential confounders
 suggested_confounders = modeler.suggest_confounders(treatment, outcome, all_factors, domain_expertises)
 
-# Suggest pair-wise relationship between variables
+# Suggest pair-wise relationships between variables
 suggested_dag = modeler.suggest_relationships(treatment, outcome, all_factors, domain_expertises, RelationshipStrategy.Pairwise)
 ```
 
+### Retrieval Augmented Generation (RAG)-based Modeler
 
+```python
+# Create an instance of the RAG-based modeler
+modeler = AugmentedModelSuggester('gpt-4')
+
+treatment = "smoking"
+outcome = "lung cancer"
+
+# Suggest the pair-wise relationship between two given variables, utilizing CauseNet for RAG
+suggested_relationship = modeler.suggest_pairwise_relationship(treatment, outcome)
+```
 
 ### Identifier
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -55,6 +55,14 @@ networkx = "<=3.2.1"
 guidance = ">=0.2"
 openai = ">=1.70"
 pydantic = ">=2.11"
+langchain = ">=0.3.25"
+langchain-chroma = ">=0.2.4"
+langchain-community = ">=0.3.24"
+langchain-core = ">=0.3.60"
+langchain-huggingface = ">=0.2.0"
+langchain-openai = ">=0.3.17"
+rank-bm25 = ">=0.2.2"
+sentence-transformers = ">=4.1.0"
 
 [tool.poetry.group.dev.dependencies]
 poethepoet = "^0.33.0"
@@ -110,7 +118,7 @@ _isort_check = 'isort --check .'
 
 # testing tasks
 test = "pytest -v -m 'not advanced' --durations=0 --durations-min=60.0"
-test_no_notebooks= "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
+test_no_notebooks = "pytest -v -m 'not advanced and not notebook' --durations=0 --durations-min=60.0"
 test_durations = "poetry run poe test --store-durations"
 test_advanced = "pytest -v"
 test_focused = "pytest -v -m 'focused'"
diff --git a/pywhyllm/suggesters/augmented_model_suggester.py b/pywhyllm/suggesters/augmented_model_suggester.py
@@ -0,0 +1,45 @@
+import logging
+import re
+
+from .simple_model_suggester import SimpleModelSuggester
+from pywhyllm.utils.data_loader import *
+from pywhyllm.utils.augmented_model_suggester_utils import *
+
+
+class AugmentedModelSuggester(SimpleModelSuggester):
+    def __init__(self, llm, file_path: str = 'data/causenet-precision.jsonl.bz2'):
+        super().__init__(llm)
+        self.file_path = file_path
+
+        logging.basicConfig(level=logging.INFO)
+        url = "https://groups.uni-paderborn.de/wdqa/causenet/causality-graphs/causenet-precision.jsonl.bz2"
+        success = download_causenet(url, file_path)
+
+        if success:
+            print(f"File downloaded to {file_path}")
+            json_data = load_causenet_json(file_path)
+            self.causenet_dict = create_causenet_dict(json_data)
+        else:
+            print("Download failed")
+
+    def suggest_pairwise_relationship(self, variable1: str, variable2: str):
+        result = find_top_match_in_causenet(self.causenet_dict, variable1, variable2)
+        if result:
+            source_text = get_source_text(result)
+            retriever = split_data_and_create_vectorstore_retriever(source_text)
+            response = query_llm(variable1, variable2, source_text, retriever)
+        else:
+            response = query_llm(variable1, variable2)
+
+        answer = re.findall(r'<answer>(.*?)</answer>', response)
+        answer = [ans.strip() for ans in answer]
+        answer_str = "".join(answer)
+
+        if answer_str == "A":
+            return [variable1, variable2, response]
+        elif answer_str == "B":
+            return [variable2, variable1, response]
+        elif answer_str == "C":
+            return [None, None, response]
+        else:
+            assert False, "Invalid answer from LLM: " + answer_str
diff --git a/pywhyllm/suggesters/simple_model_suggester.py b/pywhyllm/suggesters/simple_model_suggester.py
@@ -55,11 +55,11 @@ def suggest_pairwise_relationship(self, variable1: str, variable2: str):
         answer = [ans.strip() for ans in answer]
         answer_str = "".join(answer)
 
-        if (answer_str == "A"):
+        if answer_str == "A":
             return [variable1, variable2, description]
-        elif (answer_str == "B"):
+        elif answer_str == "B":
             return [variable2, variable1, description]
-        elif (answer_str == "C"):
+        elif answer_str == "C":
             return [None, None, description]  # maybe we want to save the description in this case too
         else:
             assert False, "Invalid answer from LLM: " + answer_str
diff --git a/pywhyllm/utils/__init__.py b/pywhyllm/utils/__init__.py
diff --git a/pywhyllm/utils/augmented_model_suggester_utils.py b/pywhyllm/utils/augmented_model_suggester_utils.py
@@ -0,0 +1,143 @@
+import os
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
+import numpy as np
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer, util
+
+
+def find_top_match_in_causenet(causenet_dict, variable1, variable2, threshold=0.7):
+    # Build one "cause-effect" string per CauseNet entry
+    pair_strings = [
+        f"{causenet_dict[key]['causal_relation']['cause']}-{causenet_dict[key]['causal_relation']['effect']}"
+        for key in causenet_dict]
+
+    # Tokenize for BM25
+    tokenized_pairs = [text.split() for text in pair_strings]
+    bm25 = BM25Okapi(tokenized_pairs)
+
+    # Original and reverse queries
+    query = variable1 + "-" + variable2
+    reverse_query = variable2 + "-" + variable1
+    tokenized_query = query.split()
+    tokenized_reverse_query = reverse_query.split()
+
+    # Combine tokens from both queries (remove duplicates)
+    combined_query = list(set(tokenized_query + tokenized_reverse_query))
+
+    # Get top-k candidates using BM25 with combined query
+    k = 5
+    scores = bm25.get_scores(combined_query)
+    top_k_indices = np.argsort(scores)[::-1][:k]
+    candidate_pairs = [pair_strings[i] for i in top_k_indices]
+
+    # Apply SBERT to candidates
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    query_embedding = model.encode(query, convert_to_tensor=True)
+    reverse_query_embedding = model.encode(reverse_query, convert_to_tensor=True)
+    candidate_embeddings = model.encode(candidate_pairs, convert_to_tensor=True)
+
+    # Compute similarities for both original and reverse queries
+    similarities = util.cos_sim(query_embedding, candidate_embeddings).flatten()
+    reverse_similarities = util.cos_sim(reverse_query_embedding, candidate_embeddings).flatten()
+
+    # Take the maximum similarity for each candidate (original or reverse)
+    max_similarities = np.maximum(similarities, reverse_similarities)
+
+    # Get the top match and its similarity score
+    top_idx = np.argmax(max_similarities)
+    top_similarity = max_similarities[top_idx]
+    top_pair = candidate_pairs[top_idx]
+
+    # Check if the top similarity meets the threshold
+    if top_similarity >= threshold:
+        print(f"Best match: {top_pair} (Similarity: {top_similarity:.4f})")
+        return causenet_dict[top_pair]
+    else:
+        print(f"No match found with similarity above {threshold} (Best similarity: {top_similarity:.4f})")
+        return None
+
+
+def get_source_text(causenet_query_result):
+    source_text = ""
+    if causenet_query_result:
+        for item in causenet_query_result["sources"]:
+            if item["type"] == 'wikipedia_sentence' or item["type"] == 'clueweb12_sentence':
+                source_text += item["payload"]["sentence"] + " "
+
+    return source_text
+
+
+def split_data_and_create_vectorstore_retriever(source_text):
+    document = Document(page_content=source_text)
+
+    # Initialize the text splitter
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=100,  # Adjust chunk size as needed
+        chunk_overlap=20  # Overlap for context
+    )
+    # Split the documents
+    splits = text_splitter.split_documents([document])
+
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+
+    # Create a vector store from the document splits
+    vectorstore = Chroma.from_documents(
+        documents=splits,
+        embedding=embeddings,
+        persist_directory="./chroma_db"  # Optional: Save to disk for reuse
+    )
+
+    # Create a retriever from the vector store
+    retriever = vectorstore.as_retriever(
+        search_type="similarity",
+        search_kwargs={"k": 5}  # Retrieve top 5 relevant chunks
+    )
+
+    return retriever
+
+
+def query_llm(variable1, variable2, source_text=None, retriever=None):
+    # Initialize the language model
+    llm = ChatOpenAI(model="gpt-4")
+
+    if source_text:
+        system_prompt = """You are a helpful assistant for causal reasoning.
+
+    Context: {context}
+    """
+    else:
+        system_prompt = """You are a helpful assistant for causal reasoning.
+    """
+
+    # prompt template
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", system_prompt),
+        ("human", "{input}")
+    ])
+
+    query = f"""Which cause-and-effect-relationship is more likely? Provide reasoning and you must give your final answer (A, B, or C) in <answer> </answer> tags with the letter only.
+            A. {variable1} causes {variable2} B. {variable2} causes {variable1} C. neither {variable1} nor {variable2} cause each other."""
+
+    # Run the query: through the RAG chain when source text is given, otherwise directly
+    if source_text:
+        # Create a document chain to combine retrieved documents
+        question_answer_chain = create_stuff_documents_chain(llm, prompt)
+
+        # Create the RAG chain
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+        response = rag_chain.invoke({"input": query})
+        return response['answer']
+
+
+    else:
+        default_chain = prompt | llm
+        response = default_chain.invoke({"input": query})
+        return response.content
diff --git a/pywhyllm/utils/data_loader.py b/pywhyllm/utils/data_loader.py
@@ -0,0 +1,103 @@
+import bz2
+import json
+import os
+import requests
+from tqdm import tqdm
+import logging
+
+
+def download_causenet(url: str, file_path: str) -> bool:
+    """
+    Download the CauseNet-Precision dataset from the given URL and save it to the specified path.
+
+    The CauseNet-Precision dataset is a subset of the CauseNet
+    knowledge base containing high-precision causal relations extracted from web sources.
+    The dataset is described in the following paper:
+
+    Citation:
+    Stefan Heindorf, Yan Scholten, Henning Wachsmuth, Axel-Cyrille Ngonga Ngomo, and Martin Potthast.
+    2020. CauseNet: Towards a Causality Graph Extracted from the Web. In Proceedings of the 29th ACM
+    International Conference on Information & Knowledge Management (CIKM '20). Association for
+    Computing Machinery, New York, NY, USA, 3023–3030. https://doi.org/10.1145/3340531.3412763
+
+    TODO: Add license
+
+    Args:
+        url (str): The URL of the file to download.
+        file_path (str): The local path where the file will be saved.
+
+    Returns:
+        bool: True if the download was successful, False otherwise.
+    """
+    try:
+        # Ensure the output directory exists
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+        # Send a GET request to the URL
+        response = requests.get(url, stream=True)
+
+        # Check if the request was successful
+        if response.status_code != 200:
+            logging.error(f"Failed to download file from {url}. Status code: {response.status_code}")
+            return False
+
+        # Get the total file size for progress bar (if available)
+        total_size = int(response.headers.get("content-length", 0))
+
+        # Download and save the file with a progress bar
+        with open(file_path, "wb") as file, tqdm(
+                desc="Downloading",
+                total=total_size,
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+        ) as progress_bar:
+            for chunk in response.iter_content(chunk_size=8192):
+                if chunk:
+                    file.write(chunk)
+                    progress_bar.update(len(chunk))
+
+        logging.info(f"File downloaded successfully to {file_path}")
+        return True
+
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error downloading file from {url}: {e}")
+        return False
+    except OSError as e:
+        logging.error(f"Error saving file to {file_path}: {e}")
+        return False
+
+
+def load_causenet_json(file_path):
+    json_data = []
+    print("Loading CauseNet using json")
+    with bz2.open(file_path, 'rt',
+                  encoding='utf-8') as file:
+        # Read each line and parse as JSON
+        for line in file:
+            line = line.strip()  # Remove trailing newlines
+            if line:  # Skip empty lines
+                json_obj = json.loads(line)  # Parse the line as JSON
+                json_data.append(json_obj)  # Add to list
+    print("Done loading CauseNet using json")
+    return json_data
+
+
+def create_causenet_dict(json_data):
+    causenet_dict = {}
+    print("Creating dictionary from CauseNet json data")
+    for item in json_data:
+        cause = item['causal_relation']['cause']['concept']
+        effect = item['causal_relation']['effect']['concept']
+        key = cause + "-" + effect
+
+        if key not in causenet_dict:
+            causenet_dict[key] = {
+                'causal_relation': {'cause': cause, 'effect': effect},
+                'sources': item['sources']
+            }
+        else:
+            # Append sources to existing list
+            causenet_dict[key]['sources'].extend(item['sources'])
+    print("Done creating dictionary from CauseNet json data")
+    return causenet_dict