Fix minor type issues; add `# type: ignore` comments to loosely typed files

luciaquirke · luciaquirke · commit d86b611b348e · 2025-03-04T23:49:55.000Z
diff --git a/delphi/__main__.py b/delphi/__main__.py
@@ -26,7 +26,7 @@
 from delphi.pipeline import Pipe, Pipeline, process_wrapper
 from delphi.scorers import DetectionScorer, FuzzingScorer
 from delphi.sparse_coders import load_hooks_sparse_coders, load_sparse_coders
-from delphi.utils import load_tokenized_data
+from delphi.utils import assert_type, load_tokenized_data
 
 
 def load_artifacts(run_cfg: RunConfig):
@@ -325,8 +325,11 @@ async def run(
     hookpoints, hookpoint_to_sparse_encode, model, transcode = load_artifacts(run_cfg)
     tokenizer = AutoTokenizer.from_pretrained(run_cfg.model, token=run_cfg.hf_token)
 
-    nrh = non_redundant_hookpoints(
-        hookpoint_to_sparse_encode, latents_path, "cache" in run_cfg.overwrite
+    nrh = assert_type(
+        dict,
+        non_redundant_hookpoints(
+            hookpoint_to_sparse_encode, latents_path, "cache" in run_cfg.overwrite
+        ),
     )
     if nrh:
         populate_cache(
@@ -340,8 +343,11 @@ async def run(
 
     del model, hookpoint_to_sparse_encode
     if run_cfg.constructor_cfg.non_activating_source == "neighbours":
-        nrh = non_redundant_hookpoints(
-            hookpoints, neighbours_path, "neighbours" in run_cfg.overwrite
+        nrh = assert_type(
+            list,
+            non_redundant_hookpoints(
+                hookpoints, neighbours_path, "neighbours" in run_cfg.overwrite
+            ),
         )
         if nrh:
             create_neighbours(
@@ -353,8 +359,11 @@ async def run(
     else:
         print("Skipping neighbour creation")
 
-    nrh = non_redundant_hookpoints(
-        hookpoints, scores_path, "scores" in run_cfg.overwrite
+    nrh = assert_type(
+        list,
+        non_redundant_hookpoints(
+            hookpoints, scores_path, "scores" in run_cfg.overwrite
+        ),
     )
     if nrh:
         await process_cache(
diff --git a/delphi/latents/latents.py b/delphi/latents/latents.py
@@ -134,7 +134,7 @@ class LatentRecord:
     train: list[ActivatingExample] = field(default_factory=list)
     """Training examples."""
 
-    test: list[ActivatingExample] = field(default_factory=list)
+    test: list[ActivatingExample] | list[list[Example]] = field(default_factory=list)
     """Test examples."""
 
     neighbours: list[Neighbour] = field(default_factory=list)
@@ -143,6 +143,9 @@ class LatentRecord:
     explanation: str = ""
     """Explanation of the latent."""
 
+    extra_examples: Optional[list[Example]] = None
+    """Extra examples to include in the record."""
+
     @property
     def max_activation(self) -> float:
         """
diff --git a/delphi/scorers/simulator/oai_autointerp/explanations/simulator.py b/delphi/scorers/simulator/oai_autointerp/explanations/simulator.py
@@ -107,8 +107,8 @@ def parse_top_logprobs(top_logprobs: dict[str, float]) -> OrderedDict[int, float
     """
     probabilities_by_distribution_value = OrderedDict()
     for token, contents in top_logprobs.items():
-        logprob = contents.logprob
-        decoded_token = contents.decoded_token
+        logprob = contents.logprob  # type: ignore
+        decoded_token = contents.decoded_token  # type: ignore
         if decoded_token in VALID_ACTIVATION_TOKENS:
             token_as_int = int(decoded_token)
             probabilities_by_distribution_value[token_as_int] = np.exp(logprob)
@@ -134,7 +134,7 @@ def compute_predicted_activation_stats_for_token(
 
 
 def parse_simulation_response(
-    response: dict[str, Any],
+    response: Any,
     tokenized_prompt: list[int],
     tab_token: int,
     tokens: Sequence[str],
@@ -250,11 +250,11 @@ async def simulate(
         else:
             assert isinstance(prompt, str)
 
-        response = await self.client.generate(prompt, **sampling_params)
-        tokenized_prompt = self.client.tokenizer.apply_chat_template(
+        response = await self.client.generate(prompt, **sampling_params)  # type: ignore
+        tokenized_prompt = self.client.tokenizer.apply_chat_template(  # type: ignore
             prompt, add_generation_prompt=True
         )
-        tab_token = self.client.tokenizer.encode("\t")[1]
+        tab_token = self.client.tokenizer.encode("\t")[1]  # type: ignore
         logger.debug("response in score_explanation_by_activations is %s", response)
         try:
             result = parse_simulation_response(
@@ -287,7 +287,7 @@ def make_simulation_prompt(
         # Consider reconciling them.
         prompt_builder = PromptBuilder()
         prompt_builder.add_message(
-            "system",
+            "system",  # type: ignore
             """We're studying neurons in a neural network.
 Each neuron looks for some particular thing in a short document.
 Look at summary of what the neuron does, and try to predict how it will fire on each token.
@@ -299,7 +299,7 @@ def make_simulation_prompt(
         few_shot_examples = self.few_shot_example_set.get_examples()
         for i, example in enumerate(few_shot_examples):
             prompt_builder.add_message(
-                "user",
+                "user",  # type: ignore
                 f"\n\nNeuron {i + 1}\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX}"
                 f"{example.explanation}",
             )
@@ -309,17 +309,17 @@ def make_simulation_prompt(
                 start_indices=example.first_revealed_activation_indices,
             )
             prompt_builder.add_message(
-                "assistant", f"\nActivations: {formatted_activation_records}\n"
+                "assistant", f"\nActivations: {formatted_activation_records}\n"  # type: ignore
             )
 
         prompt_builder.add_message(
-            "user",
+            "user",  # type: ignore
             f"\n\nNeuron {len(few_shot_examples) + 1}\nExplanation of neuron "
             f"{len(few_shot_examples) + 1} behavior: {EXPLANATION_PREFIX} "
             f"{self.explanation.strip()}",
         )
         prompt_builder.add_message(
-            "assistant",
+            "assistant",  # type: ignore
             f"\nActivations: {format_sequences_for_simulation([tokens])}",
         )
         return prompt_builder.build(self.prompt_format)
@@ -595,7 +595,7 @@ async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
 
         result = SequenceSimulation(
             activation_scale=ActivationScale.SIMULATED_NORMALIZED_ACTIVATIONS,
-            expected_activations=predicted_activations,
+            expected_activations=predicted_activations,  # type: ignore
             # Since the predicted activation is just a sampled token, we don't have a distribution.
             distribution_values=[],
             distribution_probabilities=[],
@@ -614,7 +614,7 @@ def _make_simulation_prompt_json(
         assert explanation != ""
         prompt_builder = PromptBuilder()
         prompt_builder.add_message(
-            "system",
+            "system",  # type: ignore
             """We're studying neurons in a neural network. Each neuron looks for certain things in a short document. Your task is to read the explanation of what the neuron does, and predict the neuron's activations for each token in the document.
 
 For each document, you will see the full text of the document, then the tokens in the document with the activation left blank. You will print, in valid json, the exact same tokens verbatim, but with the activation values filled in according to the explanation. Pay special attention to the explanation's description of the context and order of tokens or words.
@@ -638,7 +638,7 @@ def _make_simulation_prompt_json(
             }
             """
             prompt_builder.add_message(
-                "user",
+                "user",  # type: ignore
                 _format_record_for_logprob_free_simulation_json(
                     explanation=example.explanation,
                     activation_record=example.activation_records[0],
@@ -658,7 +658,7 @@ def _make_simulation_prompt_json(
             }
             """
             prompt_builder.add_message(
-                "assistant",
+                "assistant",  # type: ignore
                 _format_record_for_logprob_free_simulation_json(
                     explanation=example.explanation,
                     activation_record=example.activation_records[0],
@@ -678,10 +678,10 @@ def _make_simulation_prompt_json(
         }
         """
         prompt_builder.add_message(
-            "user",
+            "user",  # type: ignore
             _format_record_for_logprob_free_simulation_json(
                 explanation=explanation,
-                activation_record=ActivationRecord(tokens=tokens, activations=[]),
+                activation_record=ActivationRecord(tokens=tokens, activations=[]),  # type: ignore
                 include_activations=False,
             ),
         )
@@ -698,7 +698,7 @@ def _make_simulation_prompt(
         assert explanation != ""
         prompt_builder = PromptBuilder()
         prompt_builder.add_message(
-            "system",
+            "system",  # type: ignore
             """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.
 
 The activation format is token<tab>activation, and activations range from 0 to 10. Most activations will be 0.
@@ -716,7 +716,7 @@ def _make_simulation_prompt(
                 example.activation_records[0], include_activations=False
             )
             prompt_builder.add_message(
-                "user",
+                "user",  # type: ignore
                 f"Neuron {i + 1}\nExplanation of neuron {i + 1} behavior: {EXPLANATION_PREFIX} "
                 f"{example.explanation}\n\n"
                 f"Sequence 1 Tokens without Activations:\n{tokens_without_activations}\n\n"
@@ -728,7 +728,7 @@ def _make_simulation_prompt(
                 max_activation=few_shot_example_max_activation,
             )
             prompt_builder.add_message(
-                "assistant",
+                "assistant",  # type: ignore
                 f"{tokens_with_activations}\n\n",
             )
 
@@ -737,7 +737,7 @@ def _make_simulation_prompt(
                     record, include_activations=False
                 )
                 prompt_builder.add_message(
-                    "user",
+                    "user",  # type: ignore
                     f"Sequence {record_index + 2} Tokens without Activations:\n{tks_without}\n\n"
                     f"Sequence {record_index + 2} Tokens with Activations:\n",
                 )
@@ -747,16 +747,16 @@ def _make_simulation_prompt(
                     max_activation=few_shot_example_max_activation,
                 )
                 prompt_builder.add_message(
-                    "assistant",
+                    "assistant",  # type: ignore
                     f"{tokens_with_activations}\n\n",
                 )
 
         neuron_index = len(few_shot_examples) + 1
         tokens_without_activations = _format_record_for_logprob_free_simulation(
-            ActivationRecord(tokens=tokens, activations=[]), include_activations=False
+            ActivationRecord(tokens=tokens, activations=[]), include_activations=False  # type: ignore
         )
         prompt_builder.add_message(
-            "user",
+            "user",  # type: ignore
             f"Neuron {neuron_index}\nExplanation of neuron {neuron_index} behavior: {EXPLANATION_PREFIX} "
             f"{explanation}\n\n"
             f"Sequence 1 Tokens without Activations:\n{tokens_without_activations}\n\n"
diff --git a/delphi/scorers/surprisal/surprisal.py b/delphi/scorers/surprisal/surprisal.py
@@ -3,10 +3,13 @@
 from typing import NamedTuple
 
 import torch
+from simple_parsing import field
 from torch.nn.functional import cross_entropy
 from transformers import PreTrainedTokenizer
 
-from ...latents import Example, LatentRecord
+from delphi.utils import assert_type
+
+from ...latents import ActivatingExample, Example, LatentRecord
 from ..scorer import Scorer, ScorerResult
 from .prompts import BASEPROMPT as base_prompt
 
@@ -19,13 +22,13 @@ class SurprisalOutput:
     distance: float | int
     """Quantile or neighbor distance"""
 
-    no_explanation: list[float] = 0
+    no_explanation: list[float] = field(default_factory=list)
     """What is the surprisal of the model with no explanation"""
 
-    explanation: list[float] = 0
+    explanation: list[float] = field(default_factory=list)
     """What is the surprisal of the model with an explanation"""
 
-    activations: list[float] = 0
+    activations: list[float] = field(default_factory=list)
     """What are the activations of the model"""
 
 
@@ -55,7 +58,7 @@ def __init__(
     async def __call__(
         self,
         record: LatentRecord,
-    ) -> list[SurprisalOutput]:
+    ) -> ScorerResult:
         samples = self._prepare(record)
 
         random.shuffle(samples)
@@ -66,21 +69,24 @@ async def __call__(
 
         return ScorerResult(record=record, score=results)
 
-    def _prepare(self, record: LatentRecord) -> list[list[Sample]]:
+    def _prepare(self, record: LatentRecord) -> list[Sample]:
         """
         Prepare and shuffle a list of samples for classification.
         """
 
         defaults = {
             "tokenizer": self.tokenizer,
         }
+
+        assert record.extra_examples is not None, "No extra examples provided"
         samples = examples_to_samples(
             record.extra_examples,
             distance=-1,
             **defaults,
         )
 
         for i, examples in enumerate(record.test):
+            examples = assert_type(list, examples)
             samples.extend(
                 examples_to_samples(
                     examples,
@@ -181,7 +187,7 @@ def _query(self, explanation: str, samples: list[Sample]) -> list[SurprisalOutpu
 
 
 def examples_to_samples(
-    examples: list[Example],
+    examples: list[Example] | list[ActivatingExample],
     tokenizer: PreTrainedTokenizer,
     **sample_kwargs,
 ) -> list[Sample]:
diff --git a/delphi/sparse_coders/load_sparsify.py b/delphi/sparse_coders/load_sparsify.py
@@ -3,19 +3,21 @@
 from typing import Callable
 
 import torch
-from sparsify import Sae
+from sparsify import SparseCoder
 from torch import Tensor
 from transformers import PreTrainedModel
 
 
-def sae_dense_latents(x: Tensor, sae: Sae) -> Tensor:
+def sae_dense_latents(x: Tensor, sae: SparseCoder) -> Tensor:
     """Run `sae` on `x`, yielding the dense activations."""
     pre_acts = sae.pre_acts(x)
     acts, indices = sae.select_topk(pre_acts)
     return torch.zeros_like(pre_acts).scatter_(-1, indices, acts)
 
 
-def resolve_path(model: PreTrainedModel, path_segments: list[str]) -> list[str] | None:
+def resolve_path(
+    model: PreTrainedModel | torch.nn.Module, path_segments: list[str]
+) -> list[str] | None:
     """Attempt to resolve the path segments to the model in the case where it
     has been wrapped (e.g. by a LanguageModel, causal model, or classifier)."""
     # If the first segment is a valid attribute, return the path segments
@@ -45,7 +47,7 @@ def load_sparsify_sparse_coders(
     hookpoints: list[str],
     device: str | torch.device,
     compile: bool = False,
-) -> dict[str, Sae]:
+) -> dict[str, SparseCoder]:
     """
     Load sparsify sparse coders for specified hookpoints.
 
@@ -67,7 +69,7 @@ def load_sparsify_sparse_coders(
     name_path = Path(name)
     if name_path.exists():
         for hookpoint in hookpoints:
-            sparse_model_dict[hookpoint] = Sae.load_from_disk(
+            sparse_model_dict[hookpoint] = SparseCoder.load_from_disk(
                 name_path / hookpoint, device=device
             )
             if compile:
@@ -76,7 +78,7 @@ def load_sparsify_sparse_coders(
                 )
     else:
         # Load on CPU first to not run out of memory
-        sparse_models = Sae.load_many(name, device="cpu")
+        sparse_models = SparseCoder.load_many(name, device="cpu")
         for hookpoint in hookpoints:
             sparse_model_dict[hookpoint] = sparse_models[hookpoint].to(device)
             if compile:
diff --git a/delphi/sparse_coders/sparse_model.py b/delphi/sparse_coders/sparse_model.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+from sparsify import SparseCoder
 from transformers import PreTrainedModel
 
 from delphi.config import RunConfig
@@ -74,7 +75,7 @@ def load_sparse_coders(
     run_cfg: RunConfig,
     device: str | torch.device,
     compile: bool = False,
-) -> dict[str, nn.Module]:
+) -> dict[str, nn.Module] | dict[str, SparseCoder]:
     """
     Load sparse coders for specified hookpoints.
 
diff --git a/delphi/tests/e2e.py b/delphi/tests/e2e.py
diff --git a/delphi/utils.py b/delphi/utils.py