From f02205ef4b6f6857ca2707224b25f6751169bc42 Mon Sep 17 00:00:00 2001 From: Gabriel Erzse Date: Tue, 14 May 2024 17:27:07 +0300 Subject: [PATCH 1/4] Support missing/empty/null values in search --- redis/commands/search/commands.py | 66 ++++++------ redis/commands/search/field.py | 39 ++++++- tests/test_search.py | 174 ++++++++++++++++++++++++------ 3 files changed, 211 insertions(+), 68 deletions(-) diff --git a/redis/commands/search/commands.py b/redis/commands/search/commands.py index 2df2b5a754..7071aa0cac 100644 --- a/redis/commands/search/commands.py +++ b/redis/commands/search/commands.py @@ -4,6 +4,8 @@ from redis.client import Pipeline from redis.utils import deprecated_function +from .field import Field +from .indexDefinition import IndexDefinition from ..helpers import get_protocol_version, parse_to_dict from ._util import to_string @@ -151,44 +153,42 @@ def batch_indexer(self, chunk_size=100): def create_index( self, - fields, - no_term_offsets=False, - no_field_flags=False, - stopwords=None, - definition=None, + fields: List[Field], + no_term_offsets: bool = False, + no_field_flags: bool = False, + stopwords: Optional[List[str]] = None, + definition: Optional[IndexDefinition] = None, max_text_fields=False, temporary=None, - no_highlight=False, - no_term_frequencies=False, - skip_initial_scan=False, + no_highlight: bool = False, + no_term_frequencies: bool = False, + skip_initial_scan: bool = False, ): """ - Create the search index. The index must not already exist. - - ### Parameters: - - - **fields**: a list of TextField or NumericField objects - - **no_term_offsets**: If true, we will not save term offsets in - the index - - **no_field_flags**: If true, we will not save field flags that - allow searching in specific fields - - **stopwords**: If not None, we create the index with this custom - stopword list. The list can be empty - - **max_text_fields**: If true, we will encode indexes as if there - were more than 32 text fields which allows you to add additional - fields (beyond 32). - - **temporary**: Create a lightweight temporary index which will - expire after the specified period of inactivity (in seconds). The - internal idle timer is reset whenever the index is searched or added to. - - **no_highlight**: If true, disabling highlighting support. - Also implied by no_term_offsets. - - **no_term_frequencies**: If true, we avoid saving the term frequencies - in the index. - - **skip_initial_scan**: If true, we do not scan and index. - - For more information see `FT.CREATE `_. - """ # noqa + Creates the search index. The index must not already exist. + + For more information, see https://redis.io/commands/ft.create/ + + Args: + fields: A list of Field objects. + no_term_offsets: If `true`, term offsets will not be saved in the index. + no_field_flags: If true, field flags that allow searching in specific fields + will not be saved. + stopwords: If provided, the index will be created with this custom stopword + list. The list can be empty. + definition: If provided, the index will be created with this custom index + definition. + max_text_fields: If true, indexes will be encoded as if there were more than + 32 text fields, allowing for additional fields beyond 32. + temporary: Creates a lightweight temporary index which will expire after the + specified period of inactivity. The internal idle timer is reset whenever + the index is searched or added to. + no_highlight: If true, disables highlighting support. Also implied by + no_term_offsets. + no_term_frequencies: If true, term frequencies will not be saved in the index. + skip_initial_scan: If true, the initial scan and indexing will be skipped. + """ args = [CREATE_CMD, self.index_name] if definition is not None: args += definition.args diff --git a/redis/commands/search/field.py b/redis/commands/search/field.py index f316ed9f14..80eafb0ac1 100644 --- a/redis/commands/search/field.py +++ b/redis/commands/search/field.py @@ -1,9 +1,12 @@ -from typing import List +from typing import List, Optional from redis import DataError class Field: + """ + A class representing a field in a document. + """ NUMERIC = "NUMERIC" TEXT = "TEXT" WEIGHT = "WEIGHT" @@ -14,6 +17,9 @@ class Field: NOINDEX = "NOINDEX" AS = "AS" GEOSHAPE = "GEOSHAPE" + IS_MISSING = "ISMISSING" + IS_EMPTY = "ISEMPTY" + IS_NULL = "ISNULL" def __init__( self, @@ -21,8 +27,30 @@ def __init__( args: List[str] = None, sortable: bool = False, no_index: bool = False, + is_missing: bool = False, + is_empty: bool = False, + is_null: bool = False, + null_flags: Optional[List[str]] = None, as_name: str = None, ): + """ + Create a new field object. + + Args: + name: The name of the field. + args: + sortable: If `True`, the field will be sortable. + no_index: If `True`, the field will not be indexed. + is_missing: If `True`, it will be possible to search for documents that have + this field missing. + is_empty: If `True`, it will be possible to search for documents that have + an empty value for this field. + is_null: If `True`, it will be possible to search for documents that have + a `null` value for this field. + null_flags: If provided, this list of custom flags will be assimilated to + the `null` value. + as_name: If provided, this alias will be used for the field. + """ if args is None: args = [] self.name = name @@ -34,6 +62,15 @@ def __init__( self.args_suffix.append(Field.SORTABLE) if no_index: self.args_suffix.append(Field.NOINDEX) + if is_missing: + self.args_suffix.append(Field.IS_MISSING) + if is_empty: + self.args_suffix.append(Field.IS_EMPTY) + if is_null: + self.args_suffix.append(Field.IS_NULL) + if null_flags: + self.args_suffix.append(len(null_flags)) + self.args_suffix += null_flags if no_index and not sortable: raise ValueError("Non-Sortable non-Indexable fields are ignored") diff --git a/tests/test_search.py b/tests/test_search.py index bfe204254c..c09006e319 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -838,10 +838,10 @@ def test_spell_check(client): res = client.ft().spellcheck("lorm", include="dict") assert len(res["lorm"]) == 3 assert ( - res["lorm"][0]["suggestion"], - res["lorm"][1]["suggestion"], - res["lorm"][2]["suggestion"], - ) == ("lorem", "lore", "lorm") + res["lorm"][0]["suggestion"], + res["lorm"][1]["suggestion"], + res["lorm"][2]["suggestion"], + ) == ("lorem", "lore", "lorm") assert (res["lorm"][0]["score"], res["lorm"][1]["score"]) == ("0.5", "0") # test spellcheck exclude @@ -869,9 +869,9 @@ def test_spell_check(client): assert "lore" in res["results"]["lorm"][1].keys() assert "lorm" in res["results"]["lorm"][2].keys() assert ( - res["results"]["lorm"][0]["lorem"], - res["results"]["lorm"][1]["lore"], - ) == (0.5, 0) + res["results"]["lorm"][0]["lorem"], + res["results"]["lorm"][1]["lore"], + ) == (0.5, 0) # test spellcheck exclude res = client.ft().spellcheck("lorm", exclude="dict") @@ -940,7 +940,8 @@ def test_scorer(client): client.hset( "doc2", mapping={ - "description": "Quick alice was beginning to get very tired of sitting by her quick sister on the bank, and of having nothing to do." # noqa + "description": "Quick alice was beginning to get very tired of sitting by her quick sister on the bank, and of having nothing to do." + # noqa }, ) @@ -992,12 +993,12 @@ def test_get(client): ) assert [ - ["f1", "some valid content dd2", "f2", "this is sample text f2"] - ] == client.ft().get("doc2") + ["f1", "some valid content dd2", "f2", "this is sample text f2"] + ] == client.ft().get("doc2") assert [ - ["f1", "some valid content dd1", "f2", "this is sample text f1"], - ["f1", "some valid content dd2", "f2", "this is sample text f2"], - ] == client.ft().get("doc1", "doc2") + ["f1", "some valid content dd1", "f2", "this is sample text f1"], + ["f1", "some valid content dd2", "f2", "this is sample text f2"], + ] == client.ft().get("doc1", "doc2") @pytest.mark.redismod @@ -1040,7 +1041,8 @@ def test_aggregations_groupby(client): "ai", mapping={ "title": "RedisAI", - "body": "RedisAI executes Deep Learning/Machine Learning models and managing their data.", # noqa + "body": "RedisAI executes Deep Learning/Machine Learning models and managing their data.", + # noqa "parent": "redis", "random_num": 3, }, @@ -1049,7 +1051,8 @@ def test_aggregations_groupby(client): "json", mapping={ "title": "RedisJson", - "body": "RedisJSON implements ECMA-404 The JSON Data Interchange Standard as a native data type.", # noqa + "body": "RedisJSON implements ECMA-404 The JSON Data Interchange Standard as a native data type.", + # noqa "parent": "redis", "random_num": 8, }, @@ -1462,25 +1465,25 @@ def test_index_definition(client): ) assert [ - "ON", - "JSON", - "PREFIX", - 2, - "hset:", - "henry", - "FILTER", - "@f1==32", - "LANGUAGE_FIELD", - "play", - "LANGUAGE", - "English", - "SCORE_FIELD", - "chapter", - "SCORE", - 0.5, - "PAYLOAD_FIELD", - "txt", - ] == definition.args + "ON", + "JSON", + "PREFIX", + 2, + "hset:", + "henry", + "FILTER", + "@f1==32", + "LANGUAGE_FIELD", + "play", + "LANGUAGE", + "English", + "SCORE_FIELD", + "chapter", + "SCORE", + 0.5, + "PAYLOAD_FIELD", + "txt", + ] == definition.args createIndex(client.ft(), num_docs=500, definition=definition) @@ -2284,3 +2287,106 @@ def test_geoshape(client: redis.Redis): assert result.docs[0]["id"] == "small" result = client.ft().search(q2, query_params=qp2) assert len(result.docs) == 2 + + +@pytest.mark.redismod +def test_search_missing_fields(client): + definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH) + + fields = [ + TextField("title", sortable=True), + NumericField("price", is_missing=True), + TagField("features", is_missing=True), + GeoField("location", is_missing=True), + GeoShapeField("boundary", is_missing=True), + VectorField("image_embedding", "HNSW", + {"TYPE": "FLOAT32", "DIM": 2, "DISTANCE_METRIC": "L2"}, + is_missing=True), + TextField("description", is_missing=True), + ] + + client.ft().create_index(fields, definition=definition) + + client.hset("property:1", mapping={ + "title": "Luxury Villa in Malibu", + "price": "5000000", + "features": "pool,sea view,modern", + "location": "34.0259,-118.7798", + "boundary": "POLYGON((34.0259 -118.7798, 34.0260 -118.7799, 34.0261 -118.7797, 34.0259 -118.7798))", + "image_embedding": "0.5,0.8", + "description": "A stunning modern villa overlooking the Pacific Ocean." + }) + + # Missing title + client.hset("property:2", mapping={ + "price": "1500000", + "features": "garden,garage", + "location": "40.7128,-74.0060", + "boundary": "POLYGON((40.7127 -74.0061, 40.7129 -74.0062, 40.7130 -74.0060, 40.7128 -74.0060))", + "image_embedding": "0.2,0.3", + "description": "Cozy family home in the heart of New York City." + }) + + # Missing price + client.hset("property:3", mapping={ + "title": "Country House", + "features": "large garden,privacy", + "location": "51.5074,-0.1278", + "boundary": "POLYGON((51.5073 -0.1279, 51.5075 -0.1280, 51.5076 -0.1276, 51.5074 -0.1278))", + "image_embedding": "0.6,0.4", + "description": "Spacious country house with a large garden and lots of privacy." + }) + + # Missing features + client.hset("property:4", mapping={ + "title": "Downtown Flat", + "price": "850000", + "location": "48.8566,2.3522", + "boundary": "POLYGON((48.8565 2.3521, 48.8567 2.3523, 48.8568 2.3520, 48.8566 2.3522))", + "image_embedding": "0.1,0.9", + "description": "Modern flat in central Paris with easy access to metro." + }) + + # Missing location + client.hset("property:5", mapping={ + "title": "Beachfront Bungalow", + "price": "2900000", + "features": "beachfront,sun deck", + "boundary": "POLYGON((26.1224 -80.1373, 26.1225 -80.1374, 26.1226 -80.1372, 26.1224 -80.1373))", + "image_embedding": "0.7,0.2", + "description": "Beautiful bungalow right on the beach." + }) + + # Missing boundary + client.hset("property:6", mapping={ + "title": "Mountain Cabin", + "price": "600000", + "features": "mountain view,fireplace", + "location": "39.5501,-105.7821", + "image_embedding": "0.8,0.1", + "description": "Rustic cabin in the Rocky Mountains, perfect for a winter getaway." + }) + + # Missing image embedding + client.hset("property:7", mapping={ + "title": "Urban Studio", + "price": "1200000", + "features": "rooftop,open floor plan", + "location": "34.0522,-118.2437", + "boundary": "POLYGON((34.0521 -118.2438, 34.0523 -118.2439, 34.0524 -118.2436, 34.0522 -118.2437))", + "description": "Stylish studio in downtown Los Angeles with a spacious rooftop." + }) + + # Missing description + client.hset("property:8", mapping={ + "title": "Suburban Home", + "price": "800000", + "features": "quiet neighborhood,backyard", + "location": "37.7749,-122.4194", + "boundary": "POLYGON((37.7748 -122.4195, 37.7750 -122.4196, 37.7751 -122.4193, 37.7749 -122.4194))", + "image_embedding": "0.4,0.6" + }) + + q = Query("ismissing(@price)") + res = client.ft().search(q) + assert res is not None From 594a3c4b2f9e9a77a47c4caa149bb4e4a4432cd4 Mon Sep 17 00:00:00 2001 From: Gabriel Erzse Date: Wed, 12 Jun 2024 17:45:44 +0300 Subject: [PATCH 2/4] Update after latest master --- redis/commands/search/commands.py | 11 +- redis/commands/search/field.py | 33 +--- tests/test_search.py | 262 ++++++++++++++---------------- 3 files changed, 135 insertions(+), 171 deletions(-) diff --git a/redis/commands/search/commands.py b/redis/commands/search/commands.py index 7071aa0cac..065ffa3547 100644 --- a/redis/commands/search/commands.py +++ b/redis/commands/search/commands.py @@ -4,13 +4,13 @@ from redis.client import Pipeline from redis.utils import deprecated_function -from .field import Field -from .indexDefinition import IndexDefinition from ..helpers import get_protocol_version, parse_to_dict from ._util import to_string from .aggregation import AggregateRequest, AggregateResult, Cursor from .document import Document +from .field import Field +from .indexDefinition import IndexDefinition from .query import Query from .result import Result from .suggestion import SuggestionParser @@ -181,11 +181,12 @@ def create_index( max_text_fields: If true, indexes will be encoded as if there were more than 32 text fields, allowing for additional fields beyond 32. temporary: Creates a lightweight temporary index which will expire after the - specified period of inactivity. The internal idle timer is reset whenever - the index is searched or added to. + specified period of inactivity. The internal idle timer is reset + whenever the index is searched or added to. no_highlight: If true, disables highlighting support. Also implied by no_term_offsets. - no_term_frequencies: If true, term frequencies will not be saved in the index. + no_term_frequencies: If true, term frequencies will not be saved in the + index. skip_initial_scan: If true, the initial scan and indexing will be skipped. """ diff --git a/redis/commands/search/field.py b/redis/commands/search/field.py index 80eafb0ac1..64ff07b77c 100644 --- a/redis/commands/search/field.py +++ b/redis/commands/search/field.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List from redis import DataError @@ -7,6 +7,7 @@ class Field: """ A class representing a field in a document. """ + NUMERIC = "NUMERIC" TEXT = "TEXT" WEIGHT = "WEIGHT" @@ -17,9 +18,7 @@ class Field: NOINDEX = "NOINDEX" AS = "AS" GEOSHAPE = "GEOSHAPE" - IS_MISSING = "ISMISSING" - IS_EMPTY = "ISEMPTY" - IS_NULL = "ISNULL" + INDEX_MISSING = "INDEXMISSING" def __init__( self, @@ -27,10 +26,7 @@ def __init__( args: List[str] = None, sortable: bool = False, no_index: bool = False, - is_missing: bool = False, - is_empty: bool = False, - is_null: bool = False, - null_flags: Optional[List[str]] = None, + index_missing: bool = False, as_name: str = None, ): """ @@ -41,14 +37,8 @@ def __init__( args: sortable: If `True`, the field will be sortable. no_index: If `True`, the field will not be indexed. - is_missing: If `True`, it will be possible to search for documents that have - this field missing. - is_empty: If `True`, it will be possible to search for documents that have - an empty value for this field. - is_null: If `True`, it will be possible to search for documents that have - a `null` value for this field. - null_flags: If provided, this list of custom flags will be assimilated to - the `null` value. + index_missing: If `True`, it will be possible to search for documents that + have this field missing. as_name: If provided, this alias will be used for the field. """ if args is None: @@ -62,15 +52,8 @@ def __init__( self.args_suffix.append(Field.SORTABLE) if no_index: self.args_suffix.append(Field.NOINDEX) - if is_missing: - self.args_suffix.append(Field.IS_MISSING) - if is_empty: - self.args_suffix.append(Field.IS_EMPTY) - if is_null: - self.args_suffix.append(Field.IS_NULL) - if null_flags: - self.args_suffix.append(len(null_flags)) - self.args_suffix += null_flags + if index_missing: + self.args_suffix.append(Field.INDEX_MISSING) if no_index and not sortable: raise ValueError("Non-Sortable non-Indexable fields are ignored") diff --git a/tests/test_search.py b/tests/test_search.py index 9be0e378f6..be6322cdd5 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -838,10 +838,10 @@ def test_spell_check(client): res = client.ft().spellcheck("lorm", include="dict") assert len(res["lorm"]) == 3 assert ( - res["lorm"][0]["suggestion"], - res["lorm"][1]["suggestion"], - res["lorm"][2]["suggestion"], - ) == ("lorem", "lore", "lorm") + res["lorm"][0]["suggestion"], + res["lorm"][1]["suggestion"], + res["lorm"][2]["suggestion"], + ) == ("lorem", "lore", "lorm") assert (res["lorm"][0]["score"], res["lorm"][1]["score"]) == ("0.5", "0") # test spellcheck exclude @@ -869,9 +869,9 @@ def test_spell_check(client): assert "lore" in res["results"]["lorm"][1].keys() assert "lorm" in res["results"]["lorm"][2].keys() assert ( - res["results"]["lorm"][0]["lorem"], - res["results"]["lorm"][1]["lore"], - ) == (0.5, 0) + res["results"]["lorm"][0]["lorem"], + res["results"]["lorm"][1]["lore"], + ) == (0.5, 0) # test spellcheck exclude res = client.ft().spellcheck("lorm", exclude="dict") @@ -940,8 +940,7 @@ def test_scorer(client): client.hset( "doc2", mapping={ - "description": "Quick alice was beginning to get very tired of sitting by her quick sister on the bank, and of having nothing to do." - # noqa + "description": "Quick alice was beginning to get very tired of sitting by her quick sister on the bank, and of having nothing to do." # noqa }, ) @@ -993,12 +992,12 @@ def test_get(client): ) assert [ - ["f1", "some valid content dd2", "f2", "this is sample text f2"] - ] == client.ft().get("doc2") + ["f1", "some valid content dd2", "f2", "this is sample text f2"] + ] == client.ft().get("doc2") assert [ - ["f1", "some valid content dd1", "f2", "this is sample text f1"], - ["f1", "some valid content dd2", "f2", "this is sample text f2"], - ] == client.ft().get("doc1", "doc2") + ["f1", "some valid content dd1", "f2", "this is sample text f1"], + ["f1", "some valid content dd2", "f2", "this is sample text f2"], + ] == client.ft().get("doc1", "doc2") @pytest.mark.redismod @@ -1041,8 +1040,7 @@ def test_aggregations_groupby(client): "ai", mapping={ "title": "RedisAI", - "body": "RedisAI executes Deep Learning/Machine Learning models and managing their data.", - # noqa + "body": "RedisAI executes Deep Learning/Machine Learning models and managing their data.", # noqa "parent": "redis", "random_num": 3, }, @@ -1051,8 +1049,7 @@ def test_aggregations_groupby(client): "json", mapping={ "title": "RedisJson", - "body": "RedisJSON implements ECMA-404 The JSON Data Interchange Standard as a native data type.", - # noqa + "body": "RedisJSON implements ECMA-404 The JSON Data Interchange Standard as a native data type.", # noqa "parent": "redis", "random_num": 8, }, @@ -1463,25 +1460,25 @@ def test_index_definition(client): ) assert [ - "ON", - "JSON", - "PREFIX", - 2, - "hset:", - "henry", - "FILTER", - "@f1==32", - "LANGUAGE_FIELD", - "play", - "LANGUAGE", - "English", - "SCORE_FIELD", - "chapter", - "SCORE", - 0.5, - "PAYLOAD_FIELD", - "txt", - ] == definition.args + "ON", + "JSON", + "PREFIX", + 2, + "hset:", + "henry", + "FILTER", + "@f1==32", + "LANGUAGE_FIELD", + "play", + "LANGUAGE", + "English", + "SCORE_FIELD", + "chapter", + "SCORE", + 0.5, + "PAYLOAD_FIELD", + "txt", + ] == definition.args createIndex(client.ft(), num_docs=500, definition=definition) @@ -2108,7 +2105,7 @@ def test_geo_params(client): params_dict = {"lat": "34.95126", "lon": "29.69465", "radius": 1000, "units": "km"} q = Query("@g:[$lon $lat $radius $units]").dialect(2) res = client.ft().search(q, query_params=params_dict) - _assert_geosearch_result(client, res, ["doc1", "doc2", "doc3"]) + _assert_search_result(client, res, ["doc1", "doc2", "doc3"]) @pytest.mark.redismod @@ -2125,13 +2122,13 @@ def test_geoshapes_query_intersects_and_disjoint(client): Query("@g:[intersects $shape]").dialect(3), query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"}, ) - _assert_geosearch_result(client, intersection, ["doc_point2", "doc_polygon1"]) + _assert_search_result(client, intersection, ["doc_point2", "doc_polygon1"]) disjunction = client.ft().search( Query("@g:[disjoint $shape]").dialect(3), query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"}, ) - _assert_geosearch_result(client, disjunction, ["doc_point1", "doc_polygon2"]) + _assert_search_result(client, disjunction, ["doc_point1", "doc_polygon2"]) @pytest.mark.redismod @@ -2149,19 +2146,19 @@ def test_geoshapes_query_contains_and_within(client): Query("@g:[contains $shape]").dialect(3), query_params={"shape": "POINT(25 25)"}, ) - _assert_geosearch_result(client, contains_a, ["doc_polygon1"]) + _assert_search_result(client, contains_a, ["doc_polygon1"]) contains_b = client.ft().search( Query("@g:[contains $shape]").dialect(3), query_params={"shape": "POLYGON((24 24, 24 26, 25 25, 24 24))"}, ) - _assert_geosearch_result(client, contains_b, ["doc_polygon1"]) + _assert_search_result(client, contains_b, ["doc_polygon1"]) within = client.ft().search( Query("@g:[within $shape]").dialect(3), query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"}, ) - _assert_geosearch_result(client, within, ["doc_point2", "doc_polygon1"]) + _assert_search_result(client, within, ["doc_point2", "doc_polygon1"]) @pytest.mark.redismod @@ -2325,22 +2322,9 @@ def test_geoshape(client: redis.Redis): q2 = Query("@geom:[CONTAINS $poly]").dialect(3) qp2 = {"poly": "POLYGON((2 2, 2 50, 50 50, 50 2, 2 2))"} result = client.ft().search(q1, query_params=qp1) - _assert_geosearch_result(client, result, ["small"]) + _assert_search_result(client, result, ["small"]) result = client.ft().search(q2, query_params=qp2) - _assert_geosearch_result(client, result, ["small", "large"]) - - -def _assert_geosearch_result(client, result, expected_doc_ids): - """ - Make sure the result of a geo search is as expected, taking into account the RESP - version being used. - """ - if is_resp2_connection(client): - assert set([doc.id for doc in result.docs]) == set(expected_doc_ids) - assert result.total == len(expected_doc_ids) - else: - assert set([doc["id"] for doc in result["results"]]) == set(expected_doc_ids) - assert result["total_results"] == len(expected_doc_ids) + _assert_search_result(client, result, ["small", "large"]) @pytest.mark.redismod @@ -2349,98 +2333,94 @@ def test_search_missing_fields(client): fields = [ TextField("title", sortable=True), - NumericField("price", is_missing=True), - TagField("features", is_missing=True), - GeoField("location", is_missing=True), - GeoShapeField("boundary", is_missing=True), - VectorField("image_embedding", "HNSW", - {"TYPE": "FLOAT32", "DIM": 2, "DISTANCE_METRIC": "L2"}, - is_missing=True), - TextField("description", is_missing=True), + NumericField("price", index_missing=True), + TagField("features", index_missing=True), + GeoField("location", index_missing=True), ] client.ft().create_index(fields, definition=definition) - client.hset("property:1", mapping={ - "title": "Luxury Villa in Malibu", - "price": "5000000", - "features": "pool,sea view,modern", - "location": "34.0259,-118.7798", - "boundary": "POLYGON((34.0259 -118.7798, 34.0260 -118.7799, 34.0261 -118.7797, 34.0259 -118.7798))", - "image_embedding": "0.5,0.8", - "description": "A stunning modern villa overlooking the Pacific Ocean." - }) + # All fields present + client.hset( + "property:1", + mapping={ + "title": "Luxury Villa in Malibu", + "price": "5000000", + "features": "pool,sea view,modern", + "location": "34.0259,-118.7798", + }, + ) # Missing title - client.hset("property:2", mapping={ - "price": "1500000", - "features": "garden,garage", - "location": "40.7128,-74.0060", - "boundary": "POLYGON((40.7127 -74.0061, 40.7129 -74.0062, 40.7130 -74.0060, 40.7128 -74.0060))", - "image_embedding": "0.2,0.3", - "description": "Cozy family home in the heart of New York City." - }) + client.hset( + "property:2", + mapping={ + "price": "1500000", + "features": "garden,garage", + "location": "40.7128,-74.0060", + }, + ) # Missing price - client.hset("property:3", mapping={ - "title": "Country House", - "features": "large garden,privacy", - "location": "51.5074,-0.1278", - "boundary": "POLYGON((51.5073 -0.1279, 51.5075 -0.1280, 51.5076 -0.1276, 51.5074 -0.1278))", - "image_embedding": "0.6,0.4", - "description": "Spacious country house with a large garden and lots of privacy." - }) + client.hset( + "property:3", + mapping={ + "title": "Country House", + "features": "large garden,privacy", + "location": "51.5074,-0.1278", + }, + ) # Missing features - client.hset("property:4", mapping={ - "title": "Downtown Flat", - "price": "850000", - "location": "48.8566,2.3522", - "boundary": "POLYGON((48.8565 2.3521, 48.8567 2.3523, 48.8568 2.3520, 48.8566 2.3522))", - "image_embedding": "0.1,0.9", - "description": "Modern flat in central Paris with easy access to metro." - }) + client.hset( + "property:4", + mapping={ + "title": "Downtown Flat", + "price": "850000", + "location": "48.8566,2.3522", + }, + ) # Missing location - client.hset("property:5", mapping={ - "title": "Beachfront Bungalow", - "price": "2900000", - "features": "beachfront,sun deck", - "boundary": "POLYGON((26.1224 -80.1373, 26.1225 -80.1374, 26.1226 -80.1372, 26.1224 -80.1373))", - "image_embedding": "0.7,0.2", - "description": "Beautiful bungalow right on the beach." - }) - - # Missing boundary - client.hset("property:6", mapping={ - "title": "Mountain Cabin", - "price": "600000", - "features": "mountain view,fireplace", - "location": "39.5501,-105.7821", - "image_embedding": "0.8,0.1", - "description": "Rustic cabin in the Rocky Mountains, perfect for a winter getaway." - }) - - # Missing image embedding - client.hset("property:7", mapping={ - "title": "Urban Studio", - "price": "1200000", - "features": "rooftop,open floor plan", - "location": "34.0522,-118.2437", - "boundary": "POLYGON((34.0521 -118.2438, 34.0523 -118.2439, 34.0524 -118.2436, 34.0522 -118.2437))", - "description": "Stylish studio in downtown Los Angeles with a spacious rooftop." - }) - - # Missing description - client.hset("property:8", mapping={ - "title": "Suburban Home", - "price": "800000", - "features": "quiet neighborhood,backyard", - "location": "37.7749,-122.4194", - "boundary": "POLYGON((37.7748 -122.4195, 37.7750 -122.4196, 37.7751 -122.4193, 37.7749 -122.4194))", - "image_embedding": "0.4,0.6" - }) - - q = Query("ismissing(@price)") - res = client.ft().search(q) - assert res is not None + client.hset( + "property:5", + mapping={ + "title": "Beachfront Bungalow", + "price": "2900000", + "features": "beachfront,sun deck", + }, + ) + + with pytest.raises(redis.exceptions.ResponseError) as e: + client.ft().search( + Query("ismissing(@title)").dialect(5).return_field("id").no_content() + ) + assert "to be defined with 'INDEXMISSING'" in e.value.args[0] + + res = client.ft().search( + Query("ismissing(@price)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:3"]) + + res = client.ft().search( + Query("ismissing(@features)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:4"]) + + res = client.ft().search( + Query("ismissing(@location)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:5"]) + + +def _assert_search_result(client, result, expected_doc_ids): + """ + Make sure the result of a geo search is as expected, taking into account the RESP + version being used. + """ + if is_resp2_connection(client): + assert set([doc.id for doc in result.docs]) == set(expected_doc_ids) + assert result.total == len(expected_doc_ids) + else: + assert set([doc["id"] for doc in result["results"]]) == set(expected_doc_ids) + assert result["total_results"] == len(expected_doc_ids) From 149f5f1b2d86a6e2ad21ebaccecb98f94f79659c Mon Sep 17 00:00:00 2001 From: Gabriel Erzse Date: Wed, 12 Jun 2024 17:49:47 +0300 Subject: [PATCH 3/4] Fix indent in docs --- redis/commands/search/commands.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/redis/commands/search/commands.py b/redis/commands/search/commands.py index 065ffa3547..764ad4c2b8 100644 --- a/redis/commands/search/commands.py +++ b/redis/commands/search/commands.py @@ -173,18 +173,18 @@ def create_index( fields: A list of Field objects. no_term_offsets: If `true`, term offsets will not be saved in the index. no_field_flags: If true, field flags that allow searching in specific fields - will not be saved. + will not be saved. stopwords: If provided, the index will be created with this custom stopword - list. The list can be empty. + list. The list can be empty. definition: If provided, the index will be created with this custom index - definition. + definition. max_text_fields: If true, indexes will be encoded as if there were more than - 32 text fields, allowing for additional fields beyond 32. + 32 text fields, allowing for additional fields beyond 32. temporary: Creates a lightweight temporary index which will expire after the - specified period of inactivity. The internal idle timer is reset - whenever the index is searched or added to. + specified period of inactivity. The internal idle timer is reset + whenever the index is searched or added to. no_highlight: If true, disables highlighting support. Also implied by - no_term_offsets. + `no_term_offsets`. no_term_frequencies: If true, term frequencies will not be saved in the index. skip_initial_scan: If true, the initial scan and indexing will be skipped. From 1d92d7136d15f7a215ee88c0e98184e0d902250a Mon Sep 17 00:00:00 2001 From: Gabriel Erzse Date: Thu, 13 Jun 2024 09:08:23 +0300 Subject: [PATCH 4/4] Support indexing empty fields too --- redis/commands/search/field.py | 6 ++ tests/test_search.py | 106 ++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/redis/commands/search/field.py b/redis/commands/search/field.py index 64ff07b77c..8af7777f19 100644 --- a/redis/commands/search/field.py +++ b/redis/commands/search/field.py @@ -19,6 +19,7 @@ class Field: AS = "AS" GEOSHAPE = "GEOSHAPE" INDEX_MISSING = "INDEXMISSING" + INDEX_EMPTY = "INDEXEMPTY" def __init__( self, @@ -27,6 +28,7 @@ def __init__( sortable: bool = False, no_index: bool = False, index_missing: bool = False, + index_empty: bool = False, as_name: str = None, ): """ @@ -39,6 +41,8 @@ def __init__( no_index: If `True`, the field will not be indexed. index_missing: If `True`, it will be possible to search for documents that have this field missing. + index_empty: If `True`, it will be possible to search for documents that + have this field empty. as_name: If provided, this alias will be used for the field. """ if args is None: @@ -54,6 +58,8 @@ def __init__( self.args_suffix.append(Field.NOINDEX) if index_missing: self.args_suffix.append(Field.INDEX_MISSING) + if index_empty: + self.args_suffix.append(Field.INDEX_EMPTY) if no_index and not sortable: raise ValueError("Non-Sortable non-Indexable fields are ignored") diff --git a/tests/test_search.py b/tests/test_search.py index be6322cdd5..cceb12a547 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -2333,9 +2333,8 @@ def test_search_missing_fields(client): fields = [ TextField("title", sortable=True), - NumericField("price", index_missing=True), TagField("features", index_missing=True), - GeoField("location", index_missing=True), + TextField("description", index_missing=True), ] client.ft().create_index(fields, definition=definition) @@ -2345,72 +2344,123 @@ def test_search_missing_fields(client): "property:1", mapping={ "title": "Luxury Villa in Malibu", - "price": "5000000", "features": "pool,sea view,modern", - "location": "34.0259,-118.7798", + "description": "A stunning modern villa overlooking the Pacific Ocean.", }, ) - # Missing title + # Missing features client.hset( "property:2", mapping={ - "price": "1500000", - "features": "garden,garage", - "location": "40.7128,-74.0060", + "title": "Downtown Flat", + "description": "Modern flat in central Paris with easy access to metro.", }, ) - # Missing price + # Missing description client.hset( "property:3", mapping={ - "title": "Country House", - "features": "large garden,privacy", - "location": "51.5074,-0.1278", + "title": "Beachfront Bungalow", + "features": "beachfront,sun deck", }, ) - # Missing features + with pytest.raises(redis.exceptions.ResponseError) as e: + client.ft().search( + Query("ismissing(@title)").dialect(5).return_field("id").no_content() + ) + assert "to be defined with 'INDEXMISSING'" in e.value.args[0] + + res = client.ft().search( + Query("ismissing(@features)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:2"]) + + res = client.ft().search( + Query("-ismissing(@features)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:1", "property:3"]) + + res = client.ft().search( + Query("ismissing(@description)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:3"]) + + res = client.ft().search( + Query("-ismissing(@description)").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:1", "property:2"]) + + +@pytest.mark.redismod +def test_search_empty_fields(client): + definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH) + + fields = [ + TextField("title", sortable=True), + TagField("features", index_empty=True), + TextField("description", index_empty=True), + ] + + client.ft().create_index(fields, definition=definition) + + # All fields present client.hset( - "property:4", + "property:1", + mapping={ + "title": "Luxury Villa in Malibu", + "features": "pool,sea view,modern", + "description": "A stunning modern villa overlooking the Pacific Ocean.", + }, + ) + + # Empty features + client.hset( + "property:2", mapping={ "title": "Downtown Flat", - "price": "850000", - "location": "48.8566,2.3522", + "features": "", + "description": "Modern flat in central Paris with easy access to metro.", }, ) - # Missing location + # Empty description client.hset( - "property:5", + "property:3", mapping={ "title": "Beachfront Bungalow", - "price": "2900000", "features": "beachfront,sun deck", + "description": "", }, ) with pytest.raises(redis.exceptions.ResponseError) as e: client.ft().search( - Query("ismissing(@title)").dialect(5).return_field("id").no_content() + Query("@title:''").dialect(5).return_field("id").no_content() ) - assert "to be defined with 'INDEXMISSING'" in e.value.args[0] + assert "to be defined with `INDEXEMPTY`" in e.value.args[0] res = client.ft().search( - Query("ismissing(@price)").dialect(5).return_field("id").no_content() + Query("@features:{ }").dialect(5).return_field("id").no_content() ) - _assert_search_result(client, res, ["property:3"]) + _assert_search_result(client, res, ["property:2"]) res = client.ft().search( - Query("ismissing(@features)").dialect(5).return_field("id").no_content() + Query("-@features:{ }").dialect(5).return_field("id").no_content() + ) + _assert_search_result(client, res, ["property:1", "property:3"]) + + res = client.ft().search( + Query("@description:''").dialect(5).return_field("id").no_content() ) - _assert_search_result(client, res, ["property:4"]) + _assert_search_result(client, res, ["property:3"]) res = client.ft().search( - Query("ismissing(@location)").dialect(5).return_field("id").no_content() + Query("-@description:''").dialect(5).return_field("id").no_content() ) - _assert_search_result(client, res, ["property:5"]) + _assert_search_result(client, res, ["property:1", "property:2"]) def _assert_search_result(client, result, expected_doc_ids): @@ -2420,7 +2470,5 @@ def _assert_search_result(client, result, expected_doc_ids): """ if is_resp2_connection(client): assert set([doc.id for doc in result.docs]) == set(expected_doc_ids) - assert result.total == len(expected_doc_ids) else: assert set([doc["id"] for doc in result["results"]]) == set(expected_doc_ids) - assert result["total_results"] == len(expected_doc_ids)