Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions baselines/BDS/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need
https://arxiv.org/pdf/2507.13966
Empty file added baselines/BDS/__init__.py
Empty file.
161 changes: 161 additions & 0 deletions baselines/BDS/bds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import argparse
import asyncio
import json
import os
import re
from dataclasses import dataclass
from typing import List

import networkx as nx
from dotenv import load_dotenv
from tqdm.asyncio import tqdm as tqdm_async

from graphgen.models import NetworkXStorage, OpenAIModel, Tokenizer
from graphgen.utils import create_event_loop

# Prompt template for generating one multiple-choice question from a graph path.
# Placeholders filled with str.format(): {src} (first node of the path),
# {tgt} (last node of the path) and {path} (the rendered chain of triples).
# The <Question>/<Options>/<Answer> tags in the "Format" section are what
# _post_process() looks for in the model reply, so keep the two in sync.
QA_GENERATION_PROMPT = """
Create an agriculture examination question for advanced agricultural students that tests the relationship between {src} and {tgt}. The relationship is: {path}. The question should:
1. Be in multiple choice format (4 options)
2. Require agriculture reasoning along the relationship
3. Include a brief farm or field scenario
4. Not directly mention the relationship in the question stem
5. Have one clearly correct answer
Format:
<Question>
[Farm or Field Scenario]
</Question>
<Options>
A. [Option]
B. [Option]
C. [Option]
D. [Option]
</Options>
<Answer>:
[Correct Option Letter]
</Answer>
"""


def _post_process(text: str) -> dict:
try:
q = text.split("<Question>")[1].split("</Question>")[0].strip()
opts = text.split("<Options>")[1].split("</Options>")[0].strip().splitlines()
opts = [o.strip() for o in opts if o.strip()]
ans = text.split("<Answer>:")[1].strip()[0].upper()
return {
"question": q,
"options": opts,
"answer": ord(ans) - ord("A"),
"raw": text,
}
except Exception as e: # pylint: disable=broad-except
print(f"Error in post-processing: {e}")
return {}


@dataclass
class BDS:
    """Generates multiple-choice QA records from knowledge-graph paths.

    Each task is a dict with ``src``, ``tgt`` and ``path`` (a sequence of
    ``(head, relation, tail)`` triples). One prompt is sent to ``llm_client``
    per task and the reply is parsed with ``_post_process``.
    """

    # Client used to query the model; must expose an awaitable generate_answer().
    llm_client: OpenAIModel = None
    # Upper bound on the number of prompts in flight at the same time.
    max_concurrent: int = 1000

    def generate(self, tasks: List[dict]) -> List[dict]:
        """Run question generation for ``tasks`` synchronously.

        Returns the parsed records in completion order; tasks whose request
        failed or whose reply could not be parsed are omitted.
        """
        event_loop = create_event_loop()
        pending = self._async_generate(tasks)
        return event_loop.run_until_complete(pending)

    async def _async_generate(self, tasks: List[dict]) -> List[dict]:
        """Issue all prompts concurrently and gather the parsed replies."""
        limiter = asyncio.Semaphore(self.max_concurrent)

        async def ask(entry):
            # The semaphore caps how many requests run at once.
            async with limiter:
                triples = []
                for h, r, t in entry["path"]:
                    triples.append(f"({h},{r},{t})")
                rendered_path = " -> ".join(triples)
                prompt = QA_GENERATION_PROMPT.format(
                    src=entry["src"], tgt=entry["tgt"], path=rendered_path
                )
                reply = await self.llm_client.generate_answer(prompt)
                return _post_process(reply)

        coroutines = [ask(entry) for entry in tasks]
        collected = []
        progress = tqdm_async(asyncio.as_completed(coroutines), total=len(coroutines))
        for finished in progress:
            try:
                parsed = await finished
            except Exception as e:  # pylint: disable=broad-except
                # A single failed request must not abort the whole batch.
                print("Error:", e)
                continue
            # _post_process returns {} on parse failure; skip those.
            if parsed:
                collected.append(parsed)
        return collected


if __name__ == "__main__":
    # Script entry point: load a GraphML graph, enumerate 4-node simple paths,
    # turn each path into a QA-generation task and write the parsed results.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        help="GraphML input file path.",
        default="resources/input_examples/graphml_demo.graphml",
        type=str,
    )
    parser.add_argument(
        "--output_file",
        help="Output file path.",
        default="cache/data/bds_qa.jsonl",
        type=str,
    )
    args = parser.parse_args()

    # Pull model/tokenizer settings from a .env file into the environment.
    load_dotenv()

    tokenizer_instance: Tokenizer = Tokenizer(
        model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
    )
    llm_client = OpenAIModel(
        model_name=os.getenv("SYNTHESIZER_MODEL"),
        api_key=os.getenv("SYNTHESIZER_API_KEY"),
        base_url=os.getenv("SYNTHESIZER_BASE_URL"),
        tokenizer_instance=tokenizer_instance,
    )
    bds = BDS(llm_client=llm_client)

    graph = NetworkXStorage.load_nx_graph(args.input_file)

    # Hard cap on the number of paths (and therefore LLM requests).
    MAX_PATH = 20000
    all_paths = []

    # Work on a directed view so paths can be read as ordered triples.
    G = graph.to_directed() if not graph.is_directed() else graph
    print(G)

    # Only nodes with outgoing edges can start a path; cap at 1000 sources.
    source_nodes = [n for n in G.nodes if G.out_degree(n) > 0][:1000]

    for src in source_nodes:
        # A simple path never returns to its source, so dropping the source
        # from the targets loses nothing.
        # NOTE(review): some networkx releases appear to return no paths at
        # all when the source is among the targets - confirm against the
        # pinned version; excluding it is safe either way.
        targets = [n for n in G.nodes if n != src]
        for path in nx.all_simple_paths(G, source=src, target=targets, cutoff=3):
            # cutoff=3 also yields shorter paths; keep exactly 3-hop ones.
            if len(path) == 4:
                all_paths.append(path)
                if len(all_paths) >= MAX_PATH:
                    break
        if len(all_paths) >= MAX_PATH:
            break

    print(f"Found {len(all_paths)} 4-node paths")

    items = []
    for path in all_paths:
        path_edges = []
        for head, tail in zip(path, path[1:]):
            edge_data = G.get_edge_data(head, tail)
            if edge_data is None:
                # Fall back to the reverse direction before giving up.
                edge_data = G.get_edge_data(tail, head)
            if edge_data is None:
                print(f"Warning: No edge data between {head} and {tail}")
                relation = "related_to"
            else:
                relation = edge_data.get("relation", "related_to")
            path_edges.append((head, relation, tail))
        items.append({"src": path[0], "tgt": path[-1], "path": path_edges})

    print(f"Prepared {len(items)} items for question generation")

    qa_pairs = bds.generate(items)
    print(f"Generated {len(qa_pairs)} QA pairs")

    # Make sure the destination directory exists so the results of the
    # (expensive) generation step are not lost to a FileNotFoundError.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save results
    # NOTE: despite the default ".jsonl" suffix this writes one indented JSON
    # array, not one JSON object per line.
    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(qa_pairs, f, indent=4, ensure_ascii=False)
3 changes: 3 additions & 0 deletions baselines/EntiGraph/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# EntiGraph
https://arxiv.org/abs/2409.07431
https://github.com/zitongyang/synthetic_continued_pretraining
3 changes: 0 additions & 3 deletions baselines/EntiGraph/entigraph.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# https://arxiv.org/abs/2409.07431
# https://github.com/zitongyang/synthetic_continued_pretraining

import argparse
import asyncio
import json
Expand Down
2 changes: 2 additions & 0 deletions baselines/Genie/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Genie
https://arxiv.org/pdf/2401.14367
2 changes: 0 additions & 2 deletions baselines/Genie/genie.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# https://arxiv.org/pdf/2401.14367

import argparse
import asyncio
import json
Expand Down
3 changes: 3 additions & 0 deletions baselines/LongForm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# LongForm
https://arxiv.org/pdf/2304.08460
https://github.com/akoksal/LongForm/tree/main
3 changes: 0 additions & 3 deletions baselines/LongForm/longform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# https://arxiv.org/pdf/2304.08460
# https://github.com/akoksal/LongForm/tree/main

import argparse
import asyncio
import json
Expand Down
2 changes: 2 additions & 0 deletions baselines/SELF-QA/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SELF-QA
https://arxiv.org/abs/2305.11952
2 changes: 0 additions & 2 deletions baselines/SELF-QA/self-qa.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# https://arxiv.org/abs/2305.11952

import argparse
import asyncio
import json
Expand Down
2 changes: 2 additions & 0 deletions baselines/Wrap/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Wrap
https://arxiv.org/abs/2401.16380
2 changes: 0 additions & 2 deletions baselines/Wrap/wrap.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# https://arxiv.org/abs/2401.16380

import argparse
import asyncio
import json
Expand Down
Loading