[ENH] Series search for similarity search module #2010

Merged

Changes from all commits (28 commits)
ee17b62 WIP series search, need to do SeriesSearch estimator (baraline, Aug 7, 2024)
d9a77f8 WIP series search (baraline, Aug 12, 2024)
677f0ca commit before merging type hints contributions (baraline, Aug 19, 2024)
5a3296b Merge remote-tracking branch 'origin/main' into 1900-enh-series-searc… (baraline, Aug 19, 2024)
75f60fd Add some missing type hints, test for series search (baraline, Aug 19, 2024)
e6072bb Merge remote-tracking branch 'origin/main' into 1900-enh-series-searc… (baraline, Aug 25, 2024)
61a9a8e Adding series search, update tests (baraline, Aug 25, 2024)
6da0c11 Fix notebooks (baraline, Aug 25, 2024)
b2d3d6f Merge branch 'main' into 1900-enh-series-search-for-similarity-search… (baraline, Aug 28, 2024)
7a10b61 Merge remote-tracking branch 'origin/main' into 1900-enh-series-searc… (baraline, Aug 29, 2024)
224171f add update dot product for stamp matrix profile, update some docstrings (baraline, Aug 29, 2024)
adb6ef2 Merge branch '1900-enh-series-search-for-similarity-search-module' of… (baraline, Aug 29, 2024)
25bf7fb Add STOMP (baraline, Sep 1, 2024)
275e87d Update test_series_search.py (baraline, Sep 2, 2024)
0e3889b Update API docs and test (baraline, Sep 2, 2024)
b4069d6 update notebook (baraline, Sep 2, 2024)
08f3a75 Fix notebook (baraline, Sep 2, 2024)
70139ae Merge branch 'main' into 1900-enh-series-search-for-similarity-search… (baraline, Sep 2, 2024)
ddc5c70 Update docstring (baraline, Sep 2, 2024)
efb4062 Add test for normalized (baraline, Sep 3, 2024)
0493754 Merge branch '1900-enh-series-search-for-similarity-search-module' of… (baraline, Sep 3, 2024)
0fa8e10 update mask docstrings (baraline, Sep 3, 2024)
34a8f59 Fix STOMP and add benchmark to check (baraline, Sep 3, 2024)
ef542e7 Merge branch 'main' into 1900-enh-series-search-for-similarity-search… (baraline, Sep 3, 2024)
934fccb Merge branch 'main' into 1900-enh-series-search-for-similarity-search… (baraline, Sep 4, 2024)
2abe768 update numba list type handling (baraline, Sep 7, 2024)
b14658b Merge branch '1900-enh-series-search-for-similarity-search-module' of… (baraline, Sep 7, 2024)
9aa4eca update comment (baraline, Sep 7, 2024)
3 changes: 2 additions & 1 deletion aeon/similarity_search/__init__.py
@@ -1,6 +1,7 @@
"""Similarity search module."""

__all__ = ["BaseSimilaritySearch", "QuerySearch"]
__all__ = ["BaseSimilaritySearch", "QuerySearch", "SeriesSearch"]

from aeon.similarity_search.base import BaseSimilaritySearch
from aeon.similarity_search.query_search import QuerySearch
from aeon.similarity_search.series_search import SeriesSearch
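For context, a minimal usage sketch of the newly exported estimator (not part of the diff; the constructor arguments and the predict signature are assumptions for illustration, modeled on the existing QuerySearch API rather than confirmed by this hunk):

import numpy as np
from aeon.similarity_search import SeriesSearch  # export added by this PR

rng = np.random.default_rng(0)
X = rng.random((10, 1, 100))  # collection of 10 univariate series of length 100
S = rng.random((1, 50))       # series whose subsequences will be searched

# Hypothetical call pattern: k, distance, and the (series, subsequence_length)
# arguments of predict are assumptions, not settled API.
search = SeriesSearch(k=3, distance="euclidean")
search.fit(X)
distances, indexes = search.predict(S, 20)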
317 changes: 317 additions & 0 deletions aeon/similarity_search/_commons.py
@@ -0,0 +1,317 @@
"""Helper and common function for similarity search estimators and functions."""

__maintainer__ = ["baraline"]

import warnings

import numpy as np
from numba import njit, prange
from scipy.signal import convolve


def fft_sliding_dot_product(X, q):
"""
Use FFT convolution to calculate the sliding window dot product.

Parameters
----------
X : array, shape=(n_channels, n_timepoints)
Input time series.

q : array, shape=(n_channels, query_length)
Input query.

Returns
-------
out : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
Sliding dot product between q and X.
"""
n_channels, n_timepoints = X.shape
query_length = q.shape[1]
out = np.zeros((n_channels, n_timepoints - query_length + 1))
for i in range(n_channels):
out[i, :] = convolve(np.flipud(q[i, :]), X[i, :], mode="valid").real
return out
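# Sanity-check sketch (illustrative, not part of the diff): the FFT-based
# product should match a naive per-window dot product.
import numpy as np
from aeon.similarity_search._commons import fft_sliding_dot_product

_rng = np.random.default_rng(42)
_X = _rng.random((2, 50))  # 2 channels, 50 timepoints
_q = _X[:, 10:15]          # query of length 5 taken from X
_naive = np.array(
    [[_q[c] @ _X[c, i : i + 5] for i in range(50 - 5 + 1)] for c in range(2)]
)
assert np.allclose(fft_sliding_dot_product(_X, _q), _naive)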


def get_ith_products(X, T, L, ith):
"""
Compute dot products between X and the i-th subsequence of size L in T.

Parameters
----------
X : array, shape = (n_channels, n_timepoints_X)
Input data.
T : array, shape = (n_channels, n_timepoints_T)
Data containing the query.
L : int
Overall query length.
ith : int
Query starting index in T.

Returns
-------
np.ndarray, 2D array of shape (n_channels, n_timepoints_X - L + 1)
Sliding dot product between the i-th subsequence of size L in T and X.

"""
return fft_sliding_dot_product(X, T[:, ith : ith + L])
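# Illustrative sketch (not part of the diff): get_ith_products is a thin
# wrapper that extracts T[:, ith : ith + L] before the sliding dot product.
import numpy as np
from aeon.similarity_search._commons import (
    fft_sliding_dot_product,
    get_ith_products,
)

_rng = np.random.default_rng(0)
_X = _rng.random((1, 40))
_T = _rng.random((1, 30))
# Products between X and the subsequence T[:, 5:13] (L=8, ith=5)
assert np.allclose(
    get_ith_products(_X, _T, 8, 5),
    fft_sliding_dot_product(_X, _T[:, 5:13]),
)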


@njit(cache=True)
def numba_roll_1D_no_warparound(array, shift, warparound_value):
"""
Roll the values of a 1D array without wraparound.

Unlike ``np.roll``, values rolled past the end of the array are dropped
instead of reappearing at its start; the first ``shift`` positions are
filled with ``warparound_value``.

Parameters
----------
array : np.ndarray of shape (n_columns)
Array to roll.
shift : int
The number of indexes by which the values are shifted.
Must be less than or equal to n_columns.
warparound_value : any type
A value of the dtype of array to insert in place of the values that were
rolled past the array length.

Returns
-------
rolled_array : np.ndarray of shape (n_columns)
The rolled array, modified in place.

"""
length = array.shape[0]
_a1 = array[: length - shift]
array[shift:] = _a1
array[:shift] = warparound_value
return array
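# Illustrative sketch (not part of the diff): contrast with np.roll, which
# wraps trailing values back to the front instead of replacing them.
import numpy as np
from aeon.similarity_search._commons import numba_roll_1D_no_warparound

_a = np.arange(5.0)  # [0., 1., 2., 3., 4.]
_out = numba_roll_1D_no_warparound(_a, 3, np.inf)
# np.roll would yield [2., 3., 4., 0., 1.]; here the wrapped values are
# replaced: _out == [inf, inf, inf, 0., 1.] (the input is modified in place)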


@njit(cache=True)
def numba_roll_2D_no_warparound(array, shift, warparound_value):
"""
Roll each row of a 2D array without wraparound.

Unlike ``np.roll``, values rolled past the end of a row are dropped instead
of reappearing at its start; the first ``shift`` positions of each row are
filled with ``warparound_value``.

Parameters
----------
array : np.ndarray of shape (n_rows, n_columns)
Array to roll. Can also be a TypedList in the case where n_columns changes
between rows.
shift : int
The number of indexes by which the values of each row are shifted.
Must be less than or equal to n_columns.
warparound_value : any type
A value of the dtype of array to insert in place of the values that were
rolled past the row length.

Returns
-------
rolled_array : np.ndarray of shape (n_rows, n_columns)
The rolled array, modified in place. Can also be a TypedList in the case
where n_columns changes between rows.

"""
for i in prange(len(array)):
length = len(array[i])
_a1 = array[i][: length - shift]
array[i][shift:] = _a1
array[i][:shift] = warparound_value
return array


@njit(cache=True)
def extract_top_k_and_threshold_from_distance_profiles_one_series(
distance_profiles,
id_x,
k=1,
threshold=np.inf,
exclusion_size=None,
inverse_distance=False,
):
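# Single-series variant of extract_top_k_and_threshold_from_distance_profiles:
# distance_profiles is the 1D profile of one series and id_x its index in the
# collection, used to build (id_sample, id_timepoint) pairs in the output.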
if inverse_distance:
# To avoid div by 0 case
distance_profiles += 1e-8
distance_profiles[distance_profiles != np.inf] = (
1 / distance_profiles[distance_profiles != np.inf]
)

if threshold != np.inf:
distance_profiles[distance_profiles > threshold] = np.inf

_argsort = np.argsort(distance_profiles)

if distance_profiles[distance_profiles <= threshold].shape[0] < k:
_k = distance_profiles[distance_profiles <= threshold].shape[0]
elif _argsort.shape[0] < k:
_k = _argsort.shape[0]
else:
_k = k

if exclusion_size is None:
indexes = np.zeros((_k, 2), dtype=np.int_)
for i in range(_k):
indexes[i, 0] = id_x
indexes[i, 1] = _argsort[i]
return distance_profiles[_argsort[:_k]], indexes
else:
# Apply exclusion zone to avoid neighboring matches
top_k = np.zeros((_k, 2), dtype=np.int_) - exclusion_size
top_k_dist = np.zeros((_k), dtype=np.float_)

top_k[0, 0] = id_x
top_k[0, 1] = _argsort[0]

top_k_dist[0] = distance_profiles[_argsort[0]]

n_inserted = 1
i_current = 1

while n_inserted < _k and i_current < _argsort.shape[0]:
candidate_timestamp = _argsort[i_current]

insert = True
LB = candidate_timestamp >= (top_k[:, 1] - exclusion_size)
UB = candidate_timestamp <= (top_k[:, 1] + exclusion_size)
if np.any(UB & LB):
insert = False

if insert:
top_k[n_inserted, 0] = id_x
top_k[n_inserted, 1] = _argsort[i_current]
top_k_dist[n_inserted] = distance_profiles[_argsort[i_current]]
n_inserted += 1
i_current += 1
return top_k_dist[:n_inserted], top_k[:n_inserted]
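# Illustrative sketch (not part of the diff): top-2 matches in series 0 with
# an exclusion zone of 1 timepoint around each selected match.
import numpy as np
from aeon.similarity_search._commons import (
    extract_top_k_and_threshold_from_distance_profiles_one_series,
)

_dist = np.array([5.0, 1.0, 1.1, 4.0, 0.5])
_top_dist, _top_idx = extract_top_k_and_threshold_from_distance_profiles_one_series(
    _dist, 0, k=2, exclusion_size=1
)
# _top_dist -> [0.5, 1.0]; _top_idx rows are (id_series, id_timepoint):
# [[0, 4], [0, 1]]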


def extract_top_k_and_threshold_from_distance_profiles(
distance_profiles,
k=1,
threshold=np.inf,
exclusion_size=None,
inverse_distance=False,
):
"""
Extract the best matches from a distance profile given k and threshold parameters.

Parameters
----------
distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates)
Precomputed distance profiles. Can be a TypedList if n_candidates varies
between cases.
k : int
Number of matches to return.
threshold : float
Only matches with a distance at or below this threshold are returned.
exclusion_size : int, optional
The size of the exclusion zone used to prevent returning neighboring
candidates as top-k matches (for example i and i+1). It defines a region
between :math:`id_timestamp - exclusion_size` and
:math:`id_timestamp + exclusion_size` which cannot be returned as a best
match if :math:`id_timestamp` was already selected. By default, the value
None means that this is not used.
inverse_distance : bool, optional
Whether to return the worst matches instead of the best. The default is
False.

Returns
-------
Tuple(ndarray, ndarray)
The first array, of shape ``(n_matches)``, contains the distance between
the query and its best matches in X_. The second array, of shape
``(n_matches, 2)``, contains the indexes of these matches as
``(id_sample, id_timepoint)``. The corresponding match can be
retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``.

"""
# This whole function could be optimized and maybe made in numba to avoid stepping
# out of numba mode during distance computations

n_cases_ = len(distance_profiles)

id_timestamps = np.concatenate(
[np.arange(distance_profiles[i].shape[0]) for i in range(n_cases_)]
)
id_samples = np.concatenate(
[[i] * distance_profiles[i].shape[0] for i in range(n_cases_)]
)

distance_profiles = np.concatenate(distance_profiles)

if inverse_distance:
# To avoid div by 0 case
distance_profiles += 1e-8
distance_profiles[distance_profiles != np.inf] = (
1 / distance_profiles[distance_profiles != np.inf]
)

if threshold != np.inf:
distance_profiles[distance_profiles > threshold] = np.inf

_argsort_1d = np.argsort(distance_profiles)
_argsort = np.asarray(
[
[id_samples[_argsort_1d[i]], id_timestamps[_argsort_1d[i]]]
for i in range(len(_argsort_1d))
],
dtype=int,
)

if distance_profiles[distance_profiles <= threshold].shape[0] < k:
_k = distance_profiles[distance_profiles <= threshold].shape[0]
warnings.warn(
f"Only {_k} matches are bellow the threshold of {threshold}, while"
f" k={k}. The number of returned match will be {_k}.",
stacklevel=2,
)
elif _argsort.shape[0] < k:
_k = _argsort.shape[0]
warnings.warn(
f"The number of possible match is {_argsort.shape[0]}, but got"
f" k={k}. The number of returned match will be {_k}.",
stacklevel=2,
)
else:
_k = k

if exclusion_size is None:
return distance_profiles[_argsort_1d[:_k]], _argsort[:_k]
else:
# Apply exclusion zone to avoid neighboring matches
top_k = np.zeros((_k, 2), dtype=int)
top_k_dist = np.zeros((_k), dtype=float)

top_k[0] = _argsort[0, :]
top_k_dist[0] = distance_profiles[_argsort_1d[0]]

n_inserted = 1
i_current = 1

while n_inserted < _k and i_current < _argsort.shape[0]:
candidate_sample, candidate_timestamp = _argsort[i_current]

insert = True
is_from_same_sample = top_k[:, 0] == candidate_sample
if np.any(is_from_same_sample):
LB = candidate_timestamp >= (
top_k[is_from_same_sample, 1] - exclusion_size
)
UB = candidate_timestamp <= (
top_k[is_from_same_sample, 1] + exclusion_size
)
if np.any(UB & LB):
insert = False

if insert:
top_k[n_inserted] = _argsort[i_current]
top_k_dist[n_inserted] = distance_profiles[_argsort_1d[i_current]]
n_inserted += 1
i_current += 1
return top_k_dist[:n_inserted], top_k[:n_inserted]
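# Illustrative sketch (not part of the diff): profiles from two series with
# different numbers of candidates, flattened into global top-k matches.
import numpy as np
from aeon.similarity_search._commons import (
    extract_top_k_and_threshold_from_distance_profiles,
)

_profiles = [np.array([3.0, 0.2, 2.5]), np.array([0.4, 5.0, 1.0, 0.3])]
_top_dist, _top_idx = extract_top_k_and_threshold_from_distance_profiles(
    _profiles, k=3
)
# _top_dist -> [0.2, 0.3, 0.4]
# _top_idx  -> [[0, 1], [1, 3], [1, 0]] as (id_sample, id_timepoint) pairs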