diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9a332ffb08..0974a4da4e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,10 +12,22 @@ repos:
     -   id: debug-statements
     -   id: end-of-file-fixer
     -   id: no-commit-to-branch
-        args: [--branch, main]
     -   id: requirements-txt-fixer
         exclude: ^requirements-dev\.txt$
     -   id: trailing-whitespace
+- repo: https://github.com/pre-commit/pygrep-hooks
+  rev: v1.10.0
+  hooks:
+    - id: python-check-blanket-noqa
+    - id: python-check-blanket-type-ignore
+    - id: python-check-mock-methods
+    # - id: python-no-eval  # kept disabled: it flags the many legitimate `.eval()` calls
+    - id: python-no-log-warn
+    - id: python-use-type-annotations
+    - id: rst-backticks
+    - id: rst-directive-colons
+    - id: rst-inline-touching-normal
+    - id: text-unicode-replacement-char
 - repo: https://github.com/citation-file-format/cffconvert
   rev: 054bda51dbe278b3e86f27c890e3f3ac877d616c
   hooks:
@@ -39,14 +51,8 @@ repos:
   rev: v0.6.5
   hooks:
     - id: ruff
-      args: ["--fix", "--output-format=full"]
+      args: [--fix, --show-fixes]
     - id: ruff-format
-- repo: https://github.com/MarcoGorelli/madforhooks
-  rev: 0.4.1
-  hooks:
-    - id: no-print-statements
-      files: ^pymc/
-      exclude: (?x)(pymc/_version.py)
 - repo: local
   hooks:
     - id: check-no-tests-are-ignored
@@ -62,12 +68,6 @@ repos:
       files: ^conda-envs/environment-dev.yml$
       language: python
       name: Generate pip dependency from conda
-    - id: no-relative-imports
-      name: No relative imports
-      entry: from \.[\.\w]* import
-      types: [python]
-      language: pygrep
-      exclude: (?x)(pymc/_version.py|versioneer.py)
     - id: no-internal-links
       name: Check no links that should be cross-references are in the docs
       description: >-
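
The new pygrep hooks above drive many of the mechanical fixes later in this patch:
``python-check-blanket-noqa`` and ``python-check-blanket-type-ignore`` reject
suppressions that name no error code, and ``python-no-log-warn`` rejects the
deprecated ``logger.warn``. A minimal sketch of the noqa rule (the variable name is
only an illustration)::

    # flagged: without the colon this is a blanket noqa that silences every rule
    l = 1  # noqa E741
    # accepted: only the ambiguous-variable-name rule is suppressed
    l = 1  # noqa: E741
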
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 4aa717a2dc..a82da9bc99 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -41,10 +41,10 @@ Plots, stats and diagnostics are delegated to the
 library, a general purpose library for
 "exploratory analysis of Bayesian models".
 
-* Functions from the `arviz.plots` module are available through ``pymc.<function>`` or ``pymc.plots.<function>``,
+* Functions from the ``arviz.plots`` module are available through ``pymc.<function>`` or ``pymc.plots.<function>``,
   but for their API documentation please refer to the :ref:`ArviZ documentation <arviz:plot_api>`.
 
-* Functions from the `arviz.stats` module are available through ``pymc.<function>`` or ``pymc.stats.<function>``,
+* Functions from the ``arviz.stats`` module are available through ``pymc.<function>`` or ``pymc.stats.<function>``,
   but for their API documentation please refer to the :ref:`ArviZ documentation <arviz:stats_api>`.
 
 ArviZ is a dependency of PyMC and so, in addition to the locations described above,
diff --git a/docs/source/api/distributions/discrete.rst b/docs/source/api/distributions/discrete.rst
index 79e66bab8b..76856d54df 100644
--- a/docs/source/api/distributions/discrete.rst
+++ b/docs/source/api/distributions/discrete.rst
@@ -23,4 +23,4 @@ Discrete
 .. note::
 
    **OrderedLogistic and OrderedProbit:**
-   The `OrderedLogistic` and `OrderedProbit` distributions expect the observed values to be 0-based, i.e., they should range from `0` to `K-1`. Using 1-based indexing (like `1, 2, 3,...K`) can result in errors.
+   The ``OrderedLogistic`` and ``OrderedProbit`` distributions expect the observed values to be 0-based, i.e., they should range from ``0`` to ``K-1``. Using 1-based indexing (like ``1, 2, 3, ..., K``) can result in errors.
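
A hedged sketch of the 0-based convention with made-up data (``K-1`` cutpoints
imply ``K`` categories)::

    import numpy as np
    import pymc as pm

    y = np.array([0, 2, 1, 1, 0])  # K = 3 categories coded 0..2, not 1..3
    with pm.Model():
        cutpoints = pm.Normal("cutpoints", mu=[-1.0, 1.0], sigma=1.0)
        pm.OrderedLogistic("y_obs", eta=0.0, cutpoints=cutpoints, observed=y)
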
diff --git a/docs/source/api/shape_utils.rst b/docs/source/api/shape_utils.rst
index 7f78052f87..586a6535d1 100644
--- a/docs/source/api/shape_utils.rst
+++ b/docs/source/api/shape_utils.rst
@@ -4,9 +4,9 @@ shape_utils
 
 This submodule contains various functions that apply numpy's broadcasting rules to shape tuples, and also to samples drawn from probability distributions.
 
-The main challenge when broadcasting samples drawn from a generative model, is that each random variate has a core shape. When we draw many i.i.d samples from a given RV, for example if we ask for `size_tuple` i.i.d draws, the result usually is a `size_tuple + RV_core_shape`. In the generative model's hierarchy, the downstream RVs that are conditionally dependent on our above sampled values, will get an array with a shape that is inconsistent with the core shape they expect to see for their parameters. This is a problem sometimes because it prevents regular broadcasting in complex hierarchical models, and thus make prior and posterior predictive sampling difficult.
+The main challenge when broadcasting samples drawn from a generative model is that each random variate has a core shape. When we draw many i.i.d. samples from a given RV, for example if we ask for ``size_tuple`` i.i.d. draws, the result usually has shape ``size_tuple + RV_core_shape``. In the generative model's hierarchy, the downstream RVs that are conditionally dependent on the sampled values above will get an array with a shape that is inconsistent with the core shape they expect for their parameters. This is sometimes a problem because it prevents regular broadcasting in complex hierarchical models, and thus makes prior and posterior predictive sampling difficult.
 
-This module introduces functions that are made aware of the requested `size_tuple` of i.i.d samples, and does the broadcasting on the core shapes, transparently ignoring or moving the i.i.d `size_tuple` prepended axes around.
+This module introduces functions that are aware of the requested ``size_tuple`` of i.i.d. samples and broadcast over the core shapes, transparently ignoring or moving the prepended i.i.d. ``size_tuple`` axes around.
 
 .. currentmodule:: pymc.distributions.shape_utils
 
diff --git a/docs/source/guides/Gaussian_Processes.rst b/docs/source/guides/Gaussian_Processes.rst
index 3d1fbc80b3..513caac7fd 100644
--- a/docs/source/guides/Gaussian_Processes.rst
+++ b/docs/source/guides/Gaussian_Processes.rst
@@ -126,7 +126,7 @@ variable models and also some fast approximations.  Their usage all follows a
 similar pattern:  First, a GP is instantiated with a mean function and a
 covariance function.  Then, GP objects can be added together, allowing for
 function characteristics to be carefully modeled and separated.  Finally, one
-of `prior`, `marginal_likelihood` or `conditional` methods is called on the GP
+of ``prior``, ``marginal_likelihood`` or ``conditional`` methods is called on the GP
 object to actually construct the PyMC random variable that represents the
 function prior.
 
@@ -148,7 +148,7 @@ conditioned on.
   or other, depending on the implementation.  See the notebooks for examples.
   The :code:`conditional` method works similarly.
 
-Calling the `prior` method will create a PyMC random variable that represents
+Calling the ``prior`` method will create a PyMC random variable that represents
 the latent function :math:`f(x) = \mathbf{f}`::
 
     f = gp.prior("f", X)
@@ -218,7 +218,7 @@ thesis <https://www.cs.toronto.edu/~duvenaud/thesis.pdf>`_.
 
 The GP objects in PyMC keep track of these marginals automatically.  The
 following code sketch shows how to define the conditional distribution of
-:math:`f_2^*`.  We use `gp.Marginal` in the example, but the same works for
+:math:`f_2^*`.  We use ``gp.Marginal`` in the example, but the same works for
 other implementations.  The first block fits the GP prior.  We denote
 :math:`f_1 + f_2` as just :math:`f` for brevity::
 
@@ -255,7 +255,7 @@ arguments are required for conditionals of :math:`f1` and :math:`f2`, but not
 
 .. note::
   When constructing conditionals, the additional arguments :code:`X`, :code:`y`,
-  :code:`noise` and :code:`gp` must be provided as a dict called `given`!
+  :code:`noise` and :code:`gp` must be provided as a dict called ``given``!
 
 Since the marginal likelihood method of :code:`gp1` or :code:`gp2` wasn't called,
 their conditionals need to be provided with the required inputs.  In the same
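
To make the ``given`` requirement concrete, a sketch in the guide's own notation
(``X``, ``y``, ``noise`` and ``Xnew`` as defined earlier in the guide; newer PyMC
versions call the noise argument ``sigma``)::

    with model:
        gp = gp1 + gp2
        y_ = gp.marginal_likelihood("y", X=X, y=y, noise=noise)
        # conditionals of the components need the shared inputs via `given`
        f1_star = gp1.conditional(
            "f1_star", Xnew, given={"X": X, "y": y, "noise": noise, "gp": gp}
        )
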
diff --git a/docs/source/learn/core_notebooks/Gaussian_Processes.rst b/docs/source/learn/core_notebooks/Gaussian_Processes.rst
index f076a6f651..46f9974481 100644
--- a/docs/source/learn/core_notebooks/Gaussian_Processes.rst
+++ b/docs/source/learn/core_notebooks/Gaussian_Processes.rst
@@ -123,8 +123,8 @@ variable models and also some fast approximations.  Their usage all follows a
 similar pattern:  First, a GP is instantiated with a mean function and a
 covariance function.  Then, GP objects can be added together, allowing for
 function characteristics to be carefully modeled and separated.  Finally, one
-of `prior`, `marginal_likelihood` or `conditional` methods is called on the GP
-object to actually construct the PyMC random variable that represents the
+of ``prior``, ``marginal_likelihood`` or ``conditional`` methods is called on
+the GP object to actually construct the PyMC random variable that represents the
 function prior.
 
 Using :code:`gp.Latent` for the example, the syntax to first specify the GP
@@ -145,7 +145,7 @@ conditioned on.
   or other, depending on the implementation.  See the notebooks for examples.
   The :code:`conditional` method works similarly.
 
-Calling the `prior` method will create a PyMC random variable that represents
+Calling the ``prior`` method will create a PyMC random variable that represents
 the latent function :math:`f(x) = \mathbf{f}`::
 
     f = gp.prior("f", X)
@@ -217,7 +217,7 @@ thesis <https://www.cs.toronto.edu/~duvenaud/thesis.pdf>`_.
 
 The GP objects in PyMC keep track of these marginals automatically.  The
 following code sketch shows how to define the conditional distribution of
-:math:`f_2^*`.  We use `gp.Marginal` in the example, but the same works for
+:math:`f_2^*`.  We use ``gp.Marginal`` in the example, but the same works for
 other implementations.  The first block fits the GP prior.  We denote
 :math:`f_1 + f_2` as just :math:`f` for brevity::
 
@@ -254,7 +254,7 @@ arguments are required for conditionals of :math:`f1` and :math:`f2`, but not
 
 .. note::
   When constructing conditionals, the additional arguments :code:`X`, :code:`y`,
-  :code:`sigma` and :code:`gp` must be provided as a dict called `given`!
+  :code:`sigma` and :code:`gp` must be provided as a dict called ``given``!
 
 Since the marginal likelihood method of :code:`gp1` or :code:`gp2` wasn't called,
 their conditionals need to be provided with the required inputs.  In the same
diff --git a/pymc/backends/__init__.py b/pymc/backends/__init__.py
index 2f58b7ed8a..9130ab2554 100644
--- a/pymc/backends/__init__.py
+++ b/pymc/backends/__init__.py
@@ -85,8 +85,8 @@
     RunType: TypeAlias = Run
     HAS_MCB = True
 except ImportError:
-    TraceOrBackend = BaseTrace  # type: ignore
-    RunType = type(None)  # type: ignore
+    TraceOrBackend = BaseTrace  # type: ignore[misc]
+    RunType = type(None)  # type: ignore[assignment, misc]
 
 
 __all__ = ["to_inference_data", "predictions_to_inference_data"]
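
The narrowed comments use mypy's error-code syntax: a bare ``# type: ignore`` hides
every error on the line, while ``# type: ignore[code]`` suppresses only the named
one. A toy illustration::

    # only the assignment error code is silenced:
    x: int = "oops"  # type: ignore[assignment]
    # blanket form, now rejected by python-check-blanket-type-ignore:
    y: int = "oops"  # type: ignore
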
diff --git a/pymc/distributions/shape_utils.py b/pymc/distributions/shape_utils.py
index 0cedd5da77..808a6f3be4 100644
--- a/pymc/distributions/shape_utils.py
+++ b/pymc/distributions/shape_utils.py
@@ -260,13 +260,13 @@ def change_dist_size(
 
     """
     # Check the dimensionality of the `new_size` kwarg
-    new_size_ndim = np.ndim(new_size)  # type: ignore
+    new_size_ndim = np.ndim(new_size)  # type: ignore[arg-type]
     if new_size_ndim > 1:
         raise ShapeError("The `new_size` must be ≤1-dimensional.", actual=new_size_ndim)
     elif new_size_ndim == 0:
-        new_size = (new_size,)  # type: ignore
+        new_size = (new_size,)  # type: ignore[assignment]
     else:
-        new_size = tuple(new_size)  # type: ignore
+        new_size = tuple(new_size)  # type: ignore[arg-type]
 
     op = dist.owner.op
     new_dist = _change_dist_size(op, dist, new_size=new_size, expand=expand)
@@ -331,7 +331,7 @@ def change_specify_shape_size(op, ss, new_size, expand) -> TensorVariable:
             new_shapes[-ndim_supp:] = shapes[-ndim_supp:]
 
     # specify_shape has a wrong signature https://github.com/aesara-devs/aesara/issues/1164
-    return pt.specify_shape(new_var, new_shapes)  # type: ignore
+    return pt.specify_shape(new_var, new_shapes)  # type: ignore[arg-type]
 
 
 def get_support_shape(
@@ -395,8 +395,7 @@ def get_support_shape(
             raise ValueError(f"Number of dims is too small for ndim_supp of {ndim_supp}")
         model = modelcontext(None)
         inferred_support_shape = [
-            model.dim_lengths[dims[i]] - support_shape_offset[i]  # type: ignore
-            for i in np.arange(-ndim_supp, 0)
+            model.dim_lengths[dims[i]] - support_shape_offset[i] for i in np.arange(-ndim_supp, 0)
         ]
 
     if inferred_support_shape is None and observed is not None:
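
A usage sketch of the ``new_size`` normalization above (a scalar is wrapped into a
1-tuple before the resize)::

    import pymc as pm
    from pymc.distributions.shape_utils import change_dist_size

    d = pm.Normal.dist(mu=0.0, sigma=1.0)
    d_resized = change_dist_size(d, new_size=10)               # treated as (10,)
    d_expanded = change_dist_size(d, new_size=2, expand=True)  # prepends to the old size
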
diff --git a/pymc/distributions/simulator.py b/pymc/distributions/simulator.py
index dc7700f7d5..dea1cf234d 100644
--- a/pymc/distributions/simulator.py
+++ b/pymc/distributions/simulator.py
@@ -145,7 +145,7 @@ def __new__(cls, name, *args, **kwargs):
         return super().__new__(cls, name, *args, **kwargs)
 
     @classmethod
-    def dist(  # type: ignore
+    def dist(  # type: ignore[override]
         cls,
         fn,
         *unnamed_params,
@@ -257,7 +257,7 @@ def rv_op(
         return sim_op(*params, **kwargs)
 
 
-@_support_point.register(SimulatorRV)  # type: ignore
+@_support_point.register(SimulatorRV)
 def simulator_support_point(op, rv, *inputs):
     sim_inputs = op.dist_params(rv.owner)
     # Take the mean of 10 draws
diff --git a/pymc/distributions/transforms.py b/pymc/distributions/transforms.py
index 2c4e121b47..4bb3f7e0be 100644
--- a/pymc/distributions/transforms.py
+++ b/pymc/distributions/transforms.py
@@ -21,7 +21,7 @@
 
 # ignore mypy error because it somehow considers that
 # "numpy.core.numeric has no attribute normalize_axis_tuple"
-from numpy.core.numeric import normalize_axis_tuple  # type: ignore
+from numpy.core.numeric import normalize_axis_tuple  # type: ignore[attr-defined]
 from pytensor.graph import Op
 from pytensor.tensor import TensorVariable
 
diff --git a/pymc/gp/cov.py b/pymc/gp/cov.py
index d7f5c66569..79cbd2d9d0 100644
--- a/pymc/gp/cov.py
+++ b/pymc/gp/cov.py
@@ -147,7 +147,7 @@ def __array_wrap__(self, result):
 
     @staticmethod
     def _alloc(X, *shape: int) -> TensorVariable:
-        return pt.alloc(X, *shape)  # type: ignore
+        return pt.alloc(X, *shape)  # type: ignore[return-value]
 
 
 class Covariance(BaseCovariance):
diff --git a/pymc/gp/hsgp_approx.py b/pymc/gp/hsgp_approx.py
index f1adf331dd..3ea0a626cd 100644
--- a/pymc/gp/hsgp_approx.py
+++ b/pymc/gp/hsgp_approx.py
@@ -425,7 +425,7 @@ def prior(
         gp_dims: str | None = None,
         *args,
         **kwargs,
-    ):  # type: ignore
+    ):
         R"""
         Returns the (approximate) GP prior distribution evaluated over the input locations `X`.
         For usage examples, refer to `pm.gp.Latent`.
@@ -488,7 +488,7 @@ def _build_conditional(self, Xnew):
         elif self._parametrization == "centered":
             return self.mean_func(Xnew) + phi[:, i:] @ beta
 
-    def conditional(self, name: str, Xnew: TensorLike, dims: str | None = None):  # type: ignore
+    def conditional(self, name: str, Xnew: TensorLike, dims: str | None = None):  # type: ignore[override]
         R"""
         Returns the (approximate) conditional distribution evaluated over new input locations
         `Xnew`.
@@ -683,7 +683,7 @@ def prior_linearized(self, X: TensorLike):
         psd = self.scale * self.cov_func.power_spectral_density_approx(J)
         return (phi_cos, phi_sin), psd
 
-    def prior(self, name: str, X: TensorLike, dims: str | None = None):  # type: ignore
+    def prior(self, name: str, X: TensorLike, dims: str | None = None):  # type: ignore[override]
         R"""
         Returns the (approximate) GP prior distribution evaluated over the input locations `X`.
         For usage examples, refer to `pm.gp.Latent`.
@@ -705,8 +705,8 @@ def prior(self, name: str, X: TensorLike, dims: str | None = None):  # type: ign
         # and so does not contribute to the approximation.
         f = (
             self.mean_func(X)
-            + phi_cos @ (psd * self._beta[:m])  # type: ignore
-            + phi_sin[..., 1:] @ (psd[1:] * self._beta[m:])  # type: ignore
+            + phi_cos @ (psd * self._beta[:m])  # type: ignore[index]
+            + phi_sin[..., 1:] @ (psd[1:] * self._beta[m:])  # type: ignore[index]
         )
 
         self.f = pm.Deterministic(name, f, dims=dims)
@@ -734,7 +734,7 @@ def _build_conditional(self, Xnew):
         phi = phi_cos @ (psd * beta[:m]) + phi_sin[..., 1:] @ (psd[1:] * beta[m:])
         return self.mean_func(Xnew) + phi
 
-    def conditional(self, name: str, Xnew: TensorLike, dims: str | None = None):  # type: ignore
+    def conditional(self, name: str, Xnew: TensorLike, dims: str | None = None):  # type: ignore[override]
         R"""
         Returns the (approximate) conditional distribution evaluated over new input locations
         `Xnew`.
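
For context, a minimal HSGP sketch that exercises the ``prior``/``conditional``
overrides touched above (all values are illustrative)::

    import numpy as np
    import pymc as pm

    X = np.linspace(0.0, 10.0, 100)[:, None]
    with pm.Model():
        cov = pm.gp.cov.Matern52(1, ls=1.0)
        gp = pm.gp.HSGP(m=[25], c=1.5, cov_func=cov)
        f = gp.prior("f", X=X)
        # later, for new inputs Xnew:
        # f_star = gp.conditional("f_star", Xnew=Xnew)
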
diff --git a/pymc/logprob/basic.py b/pymc/logprob/basic.py
index d8188f2644..b30a2d7d92 100644
--- a/pymc/logprob/basic.py
+++ b/pymc/logprob/basic.py
@@ -592,7 +592,7 @@ def transformed_conditional_logp(
     }
     if values_to_transforms:
         # There seems to be an incorrect type hint in TransformValuesRewrite
-        transform_rewrite = TransformValuesRewrite(values_to_transforms)  # type: ignore
+        transform_rewrite = TransformValuesRewrite(values_to_transforms)  # type: ignore[arg-type]
 
     kwargs.setdefault("warn_rvs", False)
     temp_logp_terms = conditional_logp(
diff --git a/pymc/logprob/tensor.py b/pymc/logprob/tensor.py
index 750ace5694..a852466fc5 100644
--- a/pymc/logprob/tensor.py
+++ b/pymc/logprob/tensor.py
@@ -34,7 +34,6 @@
 #   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 #   SOFTWARE.
 
-
 from pathlib import Path
 
 from pytensor import tensor as pt
@@ -165,7 +164,7 @@ def find_measurable_stacks(fgraph, node) -> list[TensorVariable] | None:
     # the IR construction
     replacements = [(base_var, promised_valued_rv(base_var)) for base_var in base_vars]
     temp_fgraph = FunctionGraph(outputs=base_vars, clone=False)
-    toposort_replace(temp_fgraph, replacements)  # type: ignore
+    toposort_replace(temp_fgraph, replacements)  # type: ignore[arg-type]
     new_base_vars = temp_fgraph.outputs
 
     if is_join:
@@ -182,7 +181,7 @@ class MeasurableDimShuffle(MeasurableOp, DimShuffle):
 
     # Need to get the absolute path of `c_func_file`, otherwise it tries to
     # find it locally and fails when a new `Op` is initialized
-    c_func_file = str(DimShuffle.get_path(Path(DimShuffle.c_func_file)))
+    c_func_file = str(DimShuffle.get_path(Path(DimShuffle.c_func_file)))  # type: ignore[arg-type]
 
 
 @_logprob.register(MeasurableDimShuffle)
diff --git a/pymc/math.py b/pymc/math.py
index b5fc50a8eb..b732e373cf 100644
--- a/pymc/math.py
+++ b/pymc/math.py
@@ -12,7 +12,6 @@
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 
-import sys
 import warnings
 
 from functools import partial, reduce
@@ -386,8 +385,7 @@ def perform(self, node, inputs, outputs, params=None):
             log_det = np.sum(np.log(np.abs(s)))
             z[0] = np.asarray(log_det, dtype=x.dtype)
         except Exception:
-            print(f"Failed to compute logdet of {x}.", file=sys.stdout)
-            raise
+            raise ValueError(f"Failed to compute logdet of {x}.")
 
     def grad(self, inputs, g_outputs):
         [gz] = g_outputs
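
Raising inside the ``except`` block already chains the original error implicitly;
an explicit ``from exc`` would mark the SVD failure as the direct cause. A
standalone sketch of that variant, not the Op's actual ``perform``::

    import numpy as np

    def robust_logdet(x: np.ndarray) -> float:
        try:
            s = np.linalg.svd(x, compute_uv=False)
            return float(np.sum(np.log(np.abs(s))))
        except Exception as exc:
            raise ValueError(f"Failed to compute logdet of {x}.") from exc
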
diff --git a/pymc/model/transform/optimization.py b/pymc/model/transform/optimization.py
index 32ea617482..bcf828ba3e 100644
--- a/pymc/model/transform/optimization.py
+++ b/pymc/model/transform/optimization.py
@@ -77,14 +77,14 @@ def freeze_dims_and_data(
         if isinstance(datum, SharedVariable)
     }
 
-    old_outs, old_coords, old_dim_lenghts = fg.outputs, fg._coords, fg._dim_lengths  # type: ignore
+    old_outs, old_coords, old_dim_lenghts = fg.outputs, fg._coords, fg._dim_lengths  # type: ignore[attr-defined]
     # Rebuild strict will force the recreation of RV nodes with updated static types
-    new_outs = clone_replace(old_outs, replace=frozen_replacements, rebuild_strict=False)  # type: ignore
+    new_outs = clone_replace(old_outs, replace=frozen_replacements, rebuild_strict=False)  # type: ignore[arg-type]
     for old_out, new_out in zip(old_outs, new_outs):
         new_out.name = old_out.name
     fg = FunctionGraph(outputs=new_outs, clone=False)
-    fg._coords = old_coords  # type: ignore
-    fg._dim_lengths = {  # type: ignore
+    fg._coords = old_coords  # type: ignore[attr-defined]
+    fg._dim_lengths = {  # type: ignore[attr-defined]
         dim: frozen_replacements.get(dim_length, dim_length)
         for dim, dim_length in old_dim_lenghts.items()
     }
@@ -99,7 +99,7 @@ def freeze_dims_and_data(
         if transform is None:
             new_value = rv.type()
         else:
-            new_value = transform.forward(rv, *rv.owner.inputs).type()  # type: ignore
+            new_value = transform.forward(rv, *rv.owner.inputs).type()  # type: ignore[arg-type]
         new_value.name = old_value.name
         replacements[old_value] = new_value
     fg.replace_all(tuple(replacements.items()), import_missing=True)
diff --git a/pymc/model_graph.py b/pymc/model_graph.py
index 6596477261..853f4681ec 100644
--- a/pymc/model_graph.py
+++ b/pymc/model_graph.py
@@ -234,7 +234,7 @@ def _make_node(
         kwargs["cluster"] = cluster
 
     var_name: str = cast(str, node.var.name)
-    add_node(var_name.replace(":", "&"), **kwargs)  # type: ignore
+    add_node(var_name.replace(":", "&"), **kwargs)  # type: ignore[call-arg]
 
 
 class ModelGraph:
diff --git a/pymc/sampling/forward.py b/pymc/sampling/forward.py
index c1504091e5..d6bea04aa6 100644
--- a/pymc/sampling/forward.py
+++ b/pymc/sampling/forward.py
@@ -219,7 +219,7 @@ def shared_value_matches(var):
     # Walk the graph from inputs to outputs and tag the volatile variables
     nodes: list[Variable] = general_toposort(
         fg.outputs, deps=lambda x: x.owner.inputs if x.owner else []
-    )  # type: ignore
+    )  # type: ignore[call-overload]
     volatile_nodes: set[Any] = set()
     for node in nodes:
         if (
@@ -446,7 +446,7 @@ def sample_prior_predictive(
     )
 
     # All model variables have a name, but mypy does not know this
-    _log.info(f"Sampling: {sorted(volatile_basic_rvs, key=lambda var: var.name)}")  # type: ignore
+    _log.info(f"Sampling: {sorted(volatile_basic_rvs, key=lambda var: var.name)}")  # type: ignore[arg-type, return-value]
     values = zip(*(sampler_fn() for i in range(draws)))
 
     data = {k: np.stack(v) for k, v in zip(names, values)}
@@ -850,7 +850,7 @@ def sample_posterior_predictive(
     )
     sampler_fn = point_wrapper(_sampler_fn)
     # All model variables have a name, but mypy does not know this
-    _log.info(f"Sampling: {sorted(volatile_basic_rvs, key=lambda var: var.name)}")  # type: ignore
+    _log.info(f"Sampling: {sorted(volatile_basic_rvs, key=lambda var: var.name)}")  # type: ignore[arg-type, return-value]
     ppc_trace_t = _DefaultTrace(samples)
 
     progress = CustomProgress(
diff --git a/pymc/sampling/mcmc.py b/pymc/sampling/mcmc.py
index 228850e63e..41ef987308 100644
--- a/pymc/sampling/mcmc.py
+++ b/pymc/sampling/mcmc.py
@@ -221,7 +221,7 @@ def assign_step_methods(
             has_gradient = getattr(var, "dtype") not in discrete_types
             if has_gradient:
                 try:
-                    tg.grad(model_logp, var)  # type: ignore
+                    tg.grad(model_logp, var)  # type: ignore[arg-type]
                 except (NotImplementedError, tg.NullTypeGradError):
                     has_gradient = False
 
@@ -229,7 +229,7 @@ def assign_step_methods(
             rv_var = model.values_to_rvs[var]
             selected = max(
                 methods_list,
-                key=lambda method, var=rv_var, has_gradient=has_gradient: method._competence(  # type: ignore
+                key=lambda method, var=rv_var, has_gradient=has_gradient: method._competence(  # type: ignore[misc]
                     var, has_gradient
                 ),
             )
diff --git a/pymc/sampling/parallel.py b/pymc/sampling/parallel.py
index 4b76e53a97..92866396ad 100644
--- a/pymc/sampling/parallel.py
+++ b/pymc/sampling/parallel.py
@@ -363,8 +363,8 @@ def terminate_all(processes, patience=2):
                     raise multiprocessing.TimeoutError()
                 process.join(timeout)
         except multiprocessing.TimeoutError:
-            logger.warn(
-                "Chain processes did not terminate as expected. " "Terminating forcefully..."
+            logger.warning(
+                "Chain processes did not terminate as expected. Terminating forcefully..."
             )
             for process in processes:
                 process.terminate()
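
``Logger.warn`` has been a deprecated alias of ``Logger.warning`` since Python 3.3,
which is what the rename (and the ``python-no-log-warn`` hook added above) enforces::

    import logging

    logger = logging.getLogger(__name__)
    logger.warning("Chain processes did not terminate as expected.")  # preferred
    # logger.warn(...) is the deprecated alias the pygrep hook now rejects
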
diff --git a/pymc/step_methods/metropolis.py b/pymc/step_methods/metropolis.py
index 21fb6c83e8..447da340eb 100644
--- a/pymc/step_methods/metropolis.py
+++ b/pymc/step_methods/metropolis.py
@@ -916,8 +916,8 @@ def astep(self, q0: RaveledVars) -> tuple[RaveledVars, StatsType]:
         if self.other_chains is None:  # pragma: no cover
             raise RuntimeError("Population sampler has not been linked to the other chains")
         ir1, ir2 = self.rng.choice(self.other_chains, 2, replace=False)
-        r1 = DictToArrayBijection.map(self.population[ir1])  # type: ignore
-        r2 = DictToArrayBijection.map(self.population[ir2])  # type: ignore
+        r1 = DictToArrayBijection.map(self.population[ir1])  # type: ignore[index]
+        r2 = DictToArrayBijection.map(self.population[ir2])  # type: ignore[index]
         # propose a jump
         q = floatX(q0d + self.lamb * (r1.data - r2.data) + epsilon)
 
diff --git a/pymc/testing.py b/pymc/testing.py
index 7a43c63766..75932b7c08 100644
--- a/pymc/testing.py
+++ b/pymc/testing.py
@@ -214,7 +214,7 @@ def RandomPdMatrix(n):
 Rdunif = Domain([-np.inf, -1, 0, 1, np.inf], "int64")
 Rplusunif = Domain([0, 0.5, np.inf])
 Rplusdunif = Domain([0, 10, np.inf], "int64")
-I = Domain([-np.inf, -3, -2, -1, 0, 1, 2, 3, np.inf], "int64")  # noqa E741
+I = Domain([-np.inf, -3, -2, -1, 0, 1, 2, 3, np.inf], "int64")  # noqa: E741
 NatSmall = Domain([0, 3, 4, 5, np.inf], "int64")
 Nat = Domain([0, 1, 2, 3, np.inf], "int64")
 NatBig = Domain([0, 1, 2, 3, 5000, np.inf], "int64")
diff --git a/pymc/tuning/scaling.py b/pymc/tuning/scaling.py
index 08d267adb5..1438436c52 100644
--- a/pymc/tuning/scaling.py
+++ b/pymc/tuning/scaling.py
@@ -100,7 +100,7 @@ def adjust_precision(tau, scaling_bound=1e-8):
     return exp(bounded) ** 2
 
 
-def bound(a, l, u):  # noqa E741
+def bound(a, l, u):  # noqa: E741
     return np.maximum(np.minimum(a, u), l)
 
 
diff --git a/pymc/tuning/starting.py b/pymc/tuning/starting.py
index cb8ae010d7..9fa5e54b55 100644
--- a/pymc/tuning/starting.py
+++ b/pymc/tuning/starting.py
@@ -18,7 +18,6 @@
 @author: johnsalvatier
 """
 
-import sys
 import warnings
 
 from collections.abc import Sequence
@@ -142,7 +141,7 @@ def find_MAP(
     # TODO: If the mapping is fixed, we can simply create graphs for the
     # mapping and avoid all this bijection overhead
     compiled_logp_func = DictToArrayBijection.mapf(model.compile_logp(jacobian=False), start)
-    logp_func = lambda x: compiled_logp_func(RaveledVars(x, x0.point_map_info))  # noqa E731
+    logp_func = lambda x: compiled_logp_func(RaveledVars(x, x0.point_map_info))  # noqa: E731
 
     rvs = [model.values_to_rvs[vars_dict[name]] for name, _, _ in x0.point_map_info]
     try:
@@ -151,7 +150,7 @@ def find_MAP(
         compiled_dlogp_func = DictToArrayBijection.mapf(
             model.compile_dlogp(rvs, jacobian=False), start
         )
-        dlogp_func = lambda x: compiled_dlogp_func(RaveledVars(x, x0.point_map_info))  # noqa E731
+        dlogp_func = lambda x: compiled_dlogp_func(RaveledVars(x, x0.point_map_info))  # noqa: E731
         compute_gradient = True
     except (AttributeError, NotImplementedError, tg.NullTypeGradError):
         compute_gradient = False
@@ -184,7 +183,6 @@ def find_MAP(
                 pm._log.info(e)
         finally:
             cost_func.progress.update(cost_func.task, completed=cost_func.n_eval, refresh=True)
-            print(file=sys.stdout)
 
     mx0 = RaveledVars(mx0, x0.point_map_info)
     unobserved_vars = get_default_varnames(model.unobserved_value_vars, include_transformed)
diff --git a/pymc/util.py b/pymc/util.py
index 7733d41b60..1ac31fa12b 100644
--- a/pymc/util.py
+++ b/pymc/util.py
@@ -99,7 +99,7 @@ def __init__(self, iterable=(), parent=None):
         if self.parent is not None:
             self.parent.extend(self)
 
-    # typechecking here works bad
+    # type checking works poorly here
     append = withparent(list.append)
     __iadd__ = withparent(list.__iadd__)
     extend = withparent(list.extend)
@@ -144,7 +144,7 @@ def __init__(self, iterable=(), parent=None, **kwargs):
         if self.parent is not None:
             self.parent.update(self)
 
-    # typechecking here works bad
+    # type checking works poorly here
     __setitem__ = withparent(dict.__setitem__)
     update = withparent(dict.update)
 
@@ -434,7 +434,7 @@ def _get_unique_seeds_per_chain(integers_fn):
         return seeds
 
     try:
-        int_random_state = int(random_state)  # type: ignore
+        int_random_state = int(random_state)  # type: ignore[arg-type]
     except Exception:
         int_random_state = None
 
diff --git a/pymc/variational/callbacks.py b/pymc/variational/callbacks.py
index 820e9d7b84..5075ae76bf 100644
--- a/pymc/variational/callbacks.py
+++ b/pymc/variational/callbacks.py
@@ -27,12 +27,12 @@ def __call__(self, approx, loss, i):
 
 
 def relative(current: np.ndarray, prev: np.ndarray, eps=1e-6) -> np.ndarray:
-    diff = current - prev  # type: ignore
+    diff = current - prev
     return (np.abs(diff) + eps) / (np.abs(prev) + eps)
 
 
 def absolute(current: np.ndarray, prev: np.ndarray) -> np.ndarray:
-    diff = current - prev  # type: ignore
+    diff = current - prev
     return np.abs(diff)
 
 
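
These two helpers back ``CheckParametersConvergence``, which selects between them
via its ``diff`` argument; a hedged usage sketch::

    import pymc as pm
    from pymc.variational.callbacks import CheckParametersConvergence

    with pm.Model():
        mu = pm.Normal("mu", 0.0, 1.0)
        pm.Normal("obs", mu=mu, sigma=1.0, observed=[0.1, -0.2, 0.3])
        approx = pm.fit(
            n=10_000,
            callbacks=[CheckParametersConvergence(diff="relative", tolerance=1e-3)],
        )
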
diff --git a/pymc/variational/minibatch_rv.py b/pymc/variational/minibatch_rv.py
index be71a358c9..8ee05f942c 100644
--- a/pymc/variational/minibatch_rv.py
+++ b/pymc/variational/minibatch_rv.py
@@ -84,10 +84,10 @@ def get_scaling(total_size: Sequence[Variable], shape: TensorVariable) -> Tensor
     """Gets scaling constant for logp."""
 
     # mypy doesn't understand we can convert a shape TensorVariable into a tuple
-    shape = tuple(shape)  # type: ignore
+    shape = tuple(shape)  # type: ignore[assignment]
 
     # Scalar RV
-    if len(shape) == 0:  # type: ignore
+    if len(shape) == 0:  # type: ignore[arg-type]
         coef = total_size[0] if not NoneConst.equals(total_size[0]) else 1.0
     else:
         coefs = [t / shape[i] for i, t in enumerate(total_size) if not NoneConst.equals(t)]
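
``get_scaling`` is what multiplies a minibatch logp back up to the full dataset;
a sketch with made-up sizes::

    import numpy as np
    import pymc as pm

    data = np.random.default_rng(0).normal(size=1000)
    mb = pm.Minibatch(data, batch_size=100)
    with pm.Model():
        mu = pm.Normal("mu")
        # the logp of `obs` is rescaled by total_size / batch_size = 10
        pm.Normal("obs", mu=mu, sigma=1.0, observed=mb, total_size=1000)
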
diff --git a/pyproject.toml b/pyproject.toml
index 770fd0e040..8032452460 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ extend-exclude = ["_version.py"]
 docstring-code-format = true
 
 [tool.ruff.lint]
-select = ["C4", "D", "E", "F", "I", "UP", "W", "RUF"]
+select = ["C4", "D", "E", "F", "I", "UP", "W", "RUF", "T20", "TID"]
 ignore = [
   "E501",
   "F841", # Local variable name is assigned to but never used
@@ -87,6 +87,12 @@ lines-between-types = 1
   "I001", # Import block is un-sorted or un-formatted
 ]
 "tests/*" = ["D"]
+"scripts/run_mypy.py" = [
+  "T201", # No print statements
+]
+"*.ipynb" = [
+  "T201", # No print statements
+]
 
 [tool.coverage.report]
 exclude_lines = [
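
The new ``T20`` (flake8-print) selection replaces the removed ``no-print-statements``
hook, while the per-file ignores keep ``print`` legal where it is intentional::

    def fit_summary():
        print("done")  # T201: flagged everywhere except the ignored paths
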
diff --git a/tests/distributions/test_continuous.py b/tests/distributions/test_continuous.py
index 73ab07d3c1..41504816ae 100644
--- a/tests/distributions/test_continuous.py
+++ b/tests/distributions/test_continuous.py
@@ -1932,7 +1932,7 @@ def halfstudentt_rng_fn(self, df, loc, scale, size, rng):
     pymc_dist_params = {"nu": 5.0, "sigma": 2.0}
     expected_rv_op_params = {"nu": 5.0, "sigma": 2.0}
     reference_dist_params = {"df": 5.0, "loc": 0, "scale": 2.0}
-    reference_dist = lambda self: ft.partial(self.halfstudentt_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.halfstudentt_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_params_match_rv_op",
         "check_pymc_draws_match_reference",
@@ -2166,7 +2166,7 @@ def logit_normal_rng_fn(self, rng, size, loc, scale):
     pymc_dist_params = {"mu": 5.0, "sigma": 10.0}
     expected_rv_op_params = {"mu": 5.0, "sigma": 10.0}
     reference_dist_params = {"loc": 5.0, "scale": 10.0}
-    reference_dist = lambda self: ft.partial(self.logit_normal_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.logit_normal_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_params_match_rv_op",
         "check_pymc_draws_match_reference",
@@ -2237,7 +2237,7 @@ class TestBeta(BaseTestDistributionRandom):
     expected_rv_op_params = {"alpha": 2.0, "beta": 5.0}
     reference_dist_params = {"a": 2.0, "b": 5.0}
     size = 15
-    reference_dist = lambda self: ft.partial(clipped_beta_rvs, random_state=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(clipped_beta_rvs, random_state=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_params_match_rv_op",
         "check_pymc_draws_match_reference",
@@ -2443,7 +2443,7 @@ def polyagamma_rng_fn(self, size, h, z, rng):
     pymc_dist_params = {"h": 1.0, "z": 0.0}
     expected_rv_op_params = {"h": 1.0, "z": 0.0}
     reference_dist_params = {"h": 1.0, "z": 0.0}
-    reference_dist = lambda self: ft.partial(self.polyagamma_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.polyagamma_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_params_match_rv_op",
         "check_pymc_draws_match_reference",
@@ -2464,7 +2464,7 @@ def interpolated_rng_fn(self, size, mu, sigma, rng):
     pymc_dist_params = {"x_points": x_points, "pdf_points": pdf_points}
     reference_dist_params = {"mu": mu, "sigma": sigma}
 
-    reference_dist = lambda self: ft.partial(self.interpolated_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.interpolated_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_rv_size",
         "check_draws",
diff --git a/tests/distributions/test_custom.py b/tests/distributions/test_custom.py
index 45a902b8e5..5b1de16178 100644
--- a/tests/distributions/test_custom.py
+++ b/tests/distributions/test_custom.py
@@ -189,14 +189,14 @@ def logp(value, mu):
     )
     def test_custom_dist_default_support_point_univariate(self, support_point, size, expected):
         if support_point == "custom_support_point":
-            support_point = lambda rv, size, *rv_inputs: 5 * pt.ones(size, dtype=rv.dtype)  # noqa E731
+            support_point = lambda rv, size, *rv_inputs: 5 * pt.ones(size, dtype=rv.dtype)  # noqa: E731
         with Model() as model:
             x = CustomDist("x", support_point=support_point, size=size)
         assert isinstance(x.owner.op, CustomDistRV)
         assert_support_point_is_expected(model, expected, check_finite_logp=False)
 
     def test_custom_dist_moment_future_warning(self):
-        moment = lambda rv, size, *rv_inputs: 5 * pt.ones(size, dtype=rv.dtype)  # noqa E731
+        moment = lambda rv, size, *rv_inputs: 5 * pt.ones(size, dtype=rv.dtype)  # noqa: E731
         with Model() as model:
             with pytest.warns(
                 FutureWarning, match="`moment` argument is deprecated. Use `support_point` instead."
diff --git a/tests/distributions/test_discrete.py b/tests/distributions/test_discrete.py
index fa74b72944..e9be2ceded 100644
--- a/tests/distributions/test_discrete.py
+++ b/tests/distributions/test_discrete.py
@@ -853,7 +853,7 @@ def discrete_uniform_rng_fn(self, size, lower, upper, rng):
     pymc_dist_params = {"lower": -1, "upper": 9}
     expected_rv_op_params = {"lower": -1, "upper": 9}
     reference_dist_params = {"lower": -1, "upper": 9}
-    reference_dist = lambda self: ft.partial(  # noqa E731
+    reference_dist = lambda self: ft.partial(  # noqa: E731
         self.discrete_uniform_rng_fn, rng=self.get_random_state()
     )
     checks_to_run = [
diff --git a/tests/distributions/test_distribution.py b/tests/distributions/test_distribution.py
index 545bf17735..74716081ba 100644
--- a/tests/distributions/test_distribution.py
+++ b/tests/distributions/test_distribution.py
@@ -284,7 +284,7 @@ def diracdelta_rng_fn(self, size, c):
         pymc_dist_params = {"c": 3}
         expected_rv_op_params = {"c": 3}
         reference_dist_params = {"c": 3}
-        reference_dist = lambda self: self.diracdelta_rng_fn  # noqa E731
+        reference_dist = lambda self: self.diracdelta_rng_fn  # noqa: E731
         checks_to_run = [
             "check_pymc_params_match_rv_op",
             "check_pymc_draws_match_reference",
diff --git a/tests/distributions/test_multivariate.py b/tests/distributions/test_multivariate.py
index 74199fa107..1fd1b7e6d8 100644
--- a/tests/distributions/test_multivariate.py
+++ b/tests/distributions/test_multivariate.py
@@ -1797,7 +1797,7 @@ def mvstudentt_rng_fn(self, size, nu, mu, scale, rng):
         "mu": np.array([1.0, 2.0]),
         "scale": np.array([[2.0, 0.0], [0.0, 3.5]]),
     }
-    reference_dist = lambda self: ft.partial(self.mvstudentt_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.mvstudentt_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_params_match_rv_op",
         "check_pymc_draws_match_reference",
@@ -2000,7 +2000,7 @@ def wishart_rng_fn(self, size, nu, V, rng):
         (1, 3, 3),
         (4, 5, 3, 3),
     ]
-    reference_dist = lambda self: ft.partial(self.wishart_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.wishart_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_rv_size",
         "check_pymc_params_match_rv_op",
@@ -2129,7 +2129,7 @@ def kronecker_rng_fn(self, size, mu, covs=None, sigma=None, rng=None):
     sizes_to_check = [None, (), 1, (1,), 5, (4, 5), (2, 4, 2)]
     sizes_expected = [(N,), (N,), (1, N), (1, N), (5, N), (4, 5, N), (2, 4, 2, N)]
 
-    reference_dist = lambda self: ft.partial(self.kronecker_rng_fn, rng=self.get_random_state())  # noqa E731
+    reference_dist = lambda self: ft.partial(self.kronecker_rng_fn, rng=self.get_random_state())  # noqa: E731
     checks_to_run = [
         "check_pymc_draws_match_reference",
         "check_rv_size",
@@ -2382,7 +2382,7 @@ def test_mvnormal_no_cholesky_in_model_logp():
         data = np.ones((batch_size, n))
         pm.MvNormal("y", mu=mu, chol=pt.broadcast_to(chol, (batch_size, n, n)), observed=data)
 
-    contains_cholesky_op = lambda fgraph: any(  # noqa E731
+    contains_cholesky_op = lambda fgraph: any(  # noqa: E731
         isinstance(node.op, Cholesky) for node in fgraph.apply_nodes
     )
 
diff --git a/tests/distributions/test_timeseries.py b/tests/distributions/test_timeseries.py
index ca4d0eb49e..580b783d04 100644
--- a/tests/distributions/test_timeseries.py
+++ b/tests/distributions/test_timeseries.py
@@ -961,7 +961,7 @@ def _gen_sde_path(sde, pars, dt, n, x0):
                 xs.append(xs[-1] + f * dt + np.sqrt(dt) * g * wt[i])
             return np.array(xs)
 
-        sde = lambda x, lam: (lam * x, sig2)  # noqa E731
+        sde = lambda x, lam: (lam * x, sig2)  # noqa: E731
         x = floatX(_gen_sde_path(sde, (lam,), dt, N, 5.0))
         z = x + numpy_rng.standard_normal(size=x.size) * sig2
         # build model
diff --git a/tests/gp/test_cov.py b/tests/gp/test_cov.py
index db33502972..5a0d962747 100644
--- a/tests/gp/test_cov.py
+++ b/tests/gp/test_cov.py
@@ -483,7 +483,6 @@ def test_euclidean_dist(self):
                 [1, 0, 1],
             ]
         )
-        print(result, expected)
         npt.assert_allclose(result, expected, atol=1e-5)