Skip to content

Add basic statistics function #40

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion dask_match/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,30 @@ def index(self):
def size(self):
    """Return a ``Size`` expression constructed from this expression."""
    size_expr = Size(self)
    return size_expr

def _statistics(self):
    """Return the statistics known by this class itself.

    This implementation knows nothing and returns a fresh empty dictionary.
    """
    nothing_known = dict()
    return nothing_known

def statistics(self) -> dict:
    """Known quantities of an expression, like length or min/max

    To define this on a class create a `._statistics` method that returns a
    dictionary of new statistics known by that class.  If nothing is known it
    is ok to return None.  Superclasses will also be consulted.

    Every `._statistics` implementation is called exactly once, on the class
    that defines it.  Classes are visited from the most general to the most
    specific, so a subclass overrides any key also reported by a superclass.

    Returns
    -------
    dict
        The merged statistics; empty when nothing is known.

    Examples
    --------
    >>> df.statistics()
    {"length": 1000000}
    """
    out = {}
    for typ in reversed(type(self).mro()):
        if not issubclass(typ, Expr):
            continue
        # Only call implementations defined directly on ``typ``.  A class that
        # merely inherits ``_statistics`` would otherwise re-run its parent's
        # method and repeat work whose result has already been merged.
        if "_statistics" not in vars(typ):
            continue
        d = typ._statistics(self)  # TODO: maybe this should be cached
        if d:
            out.update(d)  # TODO: this is fragile
    return out

def __getitem__(self, other):
if isinstance(other, Expr):
return Filter(self, other) # df[df.x > 1]
Expand Down Expand Up @@ -468,7 +492,10 @@ class Elemwise(Blockwise):
optimizations, like `len` will care about which operations preserve length
"""

pass
def _statistics(self):
    """Report the length of the first partition-aligned dependency.

    Scans ``self.dependencies()`` in order for a dependency whose
    ``npartitions`` equals this expression's and whose statistics contain a
    ``"length"`` entry.

    Returns
    -------
    dict or None
        ``{"length": <that dependency's length>}`` for the first match, or
        ``None`` when no dependency qualifies.
    """
    for dep in self.dependencies():
        if dep.npartitions != self.npartitions:
            continue
        # Evaluate the dependency's statistics once and reuse the result for
        # both the membership test and the lookup.
        dep_stats = dep.statistics()
        if "length" in dep_stats:
            return {"length": dep_stats["length"]}
    return None


class AsType(Elemwise):
Expand Down
3 changes: 3 additions & 0 deletions dask_match/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,7 @@ def _task(self, index: int | None = None):
def __str__(self):
    """Return the fixed label ``"df"`` used as this object's string form."""
    label = "df"
    return label

def _statistics(self):
    """Report ``len(self.frame)`` as the ``"length"`` statistic."""
    frame_length = len(self.frame)
    return dict(length=frame_length)

__repr__ = __str__
5 changes: 5 additions & 0 deletions dask_match/tests/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,3 +288,8 @@ def test_simple_graphs(df):
graph = expr.__dask_graph__()

assert graph[(expr._name, 0)] == (operator.add, (df.expr._name, 0), 1)


def test_statistics(df, pdf):
    """Check which operations keep the ``"length"`` statistic.

    Adding a scalar must report the same length as ``pdf``; filtering with a
    boolean mask must not report a length.
    """
    shifted = df + 1
    shifted_stats = shifted.statistics()
    assert shifted_stats["length"] == len(pdf)

    filtered = df[df.x > 5]
    filtered_stats = filtered.statistics()
    assert filtered_stats.get("length") is None