Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docarray/array/mixins/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ def find(
'filter and query cannot be both dict type, set only one for filtering'
)
elif query is None:
if isinstance(filter, dict):
if isinstance(filter, (str, dict)):
return self._filter(filter, limit=limit)
else:
raise ValueError('filter must be dict when query is None')
raise ValueError('filter must be dict or str when query is None')
elif isinstance(query, str) or (
isinstance(query, list) and isinstance(query[0], str)
):
Expand Down
1 change: 0 additions & 1 deletion docarray/array/storage/redis/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ class BackendMixin(BaseBackendMixin):
'float': TypeMap(type='float', converter=NumericField),
'double': TypeMap(type='double', converter=NumericField),
'long': TypeMap(type='long', converter=NumericField),
'bool': TypeMap(type='long', converter=NumericField),
}

def _init_storage(
Expand Down
27 changes: 21 additions & 6 deletions docarray/array/storage/redis/find.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, TypeVar, Union

import numpy as np
Expand Down Expand Up @@ -38,14 +39,13 @@ class FindMixin(BaseFindMixin):
def _find_similar_vectors(
self,
query: 'RedisArrayType',
filter: Optional[Dict] = None,
filter: Optional[Union[str, Dict]] = None,
limit: Union[int, float] = 20,
**kwargs,
):

if filter:
nodes = _build_query_nodes(filter)
query_str = intersect(*nodes).to_string()
query_str = _get_redis_filter_query(filter)
else:
query_str = '*'

Expand Down Expand Up @@ -90,11 +90,10 @@ def _find(

def _find_with_filter(
self,
filter: Dict,
filter: Union[str, Dict],
limit: Union[int, float] = 20,
):
nodes = _build_query_nodes(filter)
query_str = intersect(*nodes).to_string()
query_str = _get_redis_filter_query(filter)
q = Query(query_str)
q.paging(0, limit)

Expand Down Expand Up @@ -218,3 +217,19 @@ def _build_query_nodes(filter):
def _build_query_str(query):
query_str = '|'.join(query.split(' '))
return query_str


def _get_redis_filter_query(filter: Union[str, Dict]):
if isinstance(filter, dict):
warnings.warn(
"Dict syntax for redis filter will be deprecated, use string literals instead",
DeprecationWarning,
)
nodes = _build_query_nodes(filter)
query_str = intersect(*nodes).to_string()
elif isinstance(filter, str):
query_str = filter
else:
raise ValueError(f'Unexpected type of filter: {type(filter)}, expected str')

return query_str
194 changes: 96 additions & 98 deletions docs/advanced/document-store/redis.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,17 +113,41 @@ da2.summary()

Other functions behave the same as in-memory DocumentArray.

## Configuration

### Vector search with filter query
The following configs can be set:

| Name | Description | Default |
|-------------------|---------------------------------------------------------------------------------------------------|-------------------------------------------------- |
| `host` | Host address of the Redis server | `'localhost'` |
| `port` | Port of the Redis Server | `6379` |
| `redis_config` | Other Redis configs in a Dict and pass to `Redis` client constructor, e.g. `socket_timeout`, `ssl`| `{}` |
| `index_name` | Redis index name; the name of RedisSearch index to set this DocumentArray | `None` |
| `n_dim` | Dimensionality of the embeddings | `None` |
| `update_schema` | Boolean flag indicating whether to update Redis Search schema | `True` |
| `distance` | Similarity distance metric in Redis, one of {`'L2'`, `'IP'`, `'COSINE'`} | `'COSINE'` |
| `batch_size` | Batch size used to handle storage updates | `64` |
| `method` | Vector similarity index algorithm in Redis, either `FLAT` or `HNSW` | `'HNSW'` |
| `index_text` | Boolean flag indicating whether to index `.text`. `True` will enable full text search on `.text` | `None` |
| `tag_indices` | List of tags to index as text field | `[]` |
| `ef_construction` | Optional parameter for Redis HNSW algorithm | `200` |
| `m` | Optional parameter for Redis HNSW algorithm | `16` |
| `ef_runtime` | Optional parameter for Redis HNSW algorithm | `10` |
| `block_size` | Optional parameter for Redis FLAT algorithm | `1048576` |
| `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | `None`, defaults to the default value in Redis |
| `columns` | Other fields to store in Document and build schema | `None` |

You can perform Vector Similarity Search based on [FLAT or HNSW algorithm](vector-search-index) and pre-filter results using a filter query that is based on [MongoDB's Query](https://www.mongodb.com/docs/manual/reference/operator/query/). The following tag filters can be combined with `$and` and `$or`:
You can check the default values in [the docarray source code](https://github.com/jina-ai/docarray/blob/main/docarray/array/storage/redis/backend.py).
For vector search configurations, default values are those of the database backend, which you can find in the [Redis documentation](https://redis.io/docs/stack/search/reference/vectors/).

- `$eq` - Equal to (number, string)
- `$ne` - Not equal to (number, string)
- `$gt` - Greater than (number)
- `$gte` - Greater than or equal to (number)
- `$lt` - Less than (number)
- `$lte` - Less than or equal to (number)
```{note}
We will support geo-filtering soon.
The benchmark test is on the way.
```

### Vector search with filter query

You can perform Vector Similarity Search based on [FLAT or HNSW algorithm](vector-search-index) and pre-filter results using [Redis' Search Query Syntax](https://redis.io/docs/stack/search/reference/query_syntax/).


Consider Documents with embeddings `[0, 0, 0]` up to `[9, 9, 9]` where the Document with embedding `[i, i, i]`
Expand All @@ -139,7 +163,7 @@ da = DocumentArray(
storage='redis',
config={
'n_dim': n_dim,
'columns': {'price': 'int', 'color': 'str', 'stock': 'bool'},
'columns': {'price': 'int', 'color': 'str', 'stock': 'int'},
'distance': 'L2',
},
)
Expand All @@ -150,7 +174,7 @@ with da:
Document(
id=f'{i}',
embedding=i * np.ones(n_dim),
tags={'price': i, 'color': 'blue', 'stock': i % 2 == 0},
tags={'price': i, 'color': 'blue', 'stock': int(i % 2 == 0)},
)
for i in range(10)
]
Expand All @@ -160,7 +184,7 @@ with da:
Document(
id=f'{i+10}',
embedding=i * np.ones(n_dim),
tags={'price': i, 'color': 'red', 'stock': i % 2 == 0},
tags={'price': i, 'color': 'red', 'stock': int(i % 2 == 0)},
)
for i in range(10)
]
Expand All @@ -176,22 +200,7 @@ for doc in da:
Consider the case where you want the nearest vectors to the embedding `[8., 8., 8.]`, with the restriction that prices, colors and stock must pass a filter. For example, let's consider that retrieved Documents must have a `price` value lower than or equal to `max_price`, have `color` equal to `blue` and have `stock` equal to `True`. We can encode this information in Redis using

```text
{
"price": {"$lte": max_price},
"color": {"$gt": color},
"stock": {"$eq": True},
}
```
or

```text
{
"$and": {
"price": {"$lte": max_price},
"color": {"$gt": color},
"stock": {"$eq": True},
}
}
@price:[-inf {max_price}] @color:{color} @stock:[1 1]
```

Then the search with the proposed filter can be used as follows:
Expand All @@ -203,11 +212,7 @@ n_limit = 5
np_query = np.ones(n_dim) * 8
print(f'\nQuery vector: \t{np_query}')

filter = {
"price": {"$lte": max_price},
"color": {"$eq": color},
"stock": {"$eq": True},
}
filter = f'@price:[-inf {max_price}] @color:{color} @stock:[1 1]'

results = da.find(np_query, filter=filter, limit=n_limit)

Expand All @@ -225,49 +230,73 @@ This will print:
```console
Embeddings Approximate Nearest Neighbours with "price" at most 7, "color" blue and "stock" True:

score=12, embedding=[6. 6. 6.], price=6, color=blue, stock=True
score=48, embedding=[4. 4. 4.], price=4, color=blue, stock=True
score=108, embedding=[2. 2. 2.], price=2, color=blue, stock=True
score=192, embedding=[0. 0. 0.], price=0, color=blue, stock=True
```
More example filter expressions:
- A Nike shoes or price less than `100`

```JSON
{
"$or": {
"brand": {"$eq": "Nike"},
"price": {"$lt": 100}
}
}
score=12, embedding=[6. 6. 6.], price=6, color=blue, stock=1
score=48, embedding=[4. 4. 4.], price=4, color=blue, stock=1
score=108, embedding=[2. 2. 2.], price=2, color=blue, stock=1
score=192, embedding=[0. 0. 0.], price=0, color=blue, stock=1
```

- A Nike shoes **and** either price is less than `100` or color is `"blue"`
````{admonition} Note
:class: note
Note that Redis does not support Boolean types in attributes. Therefore, you need to configure your boolean field as
an integer in the `columns` configuration (`'field': 'int'`) and use a filter query that treats the field as an integer
(`@field:[1 1]`).
````

### Search by filter query

One can search with user-defined query filters using the `.find` method. Such queries follow the [Redis Search Query Syntax](https://redis.io/docs/stack/search/reference/query_syntax/).

Consider a case where you store Documents with a tag of `price` into Redis and you want to retrieve all Documents
with `price` less than or equal to some `max_price` value.

You can index such Documents as follows:

```JSON
{
"brand": {"$eq": "Nike"},
"$or": {
"price": {"$lt": 100},
"color": {"$eq": "blue"},
```python
from docarray import Document, DocumentArray

n_dim = 3
da = DocumentArray(
storage='redis',
config={
'n_dim': n_dim,
'columns': {'price': 'float'},
},
}
)

with da:
da.extend([Document(id=f'r{i}', tags={'price': i}) for i in range(10)])

print('\nIndexed Prices:\n')
for price in da[:, 'tags__price']:
print(f'\t price={price}')
```

Then you can retrieve all documents whose price is less than or equal to `max_price` by applying the following
filter:

```python
max_price = 3
n_limit = 4

# Numeric range filter in Redis Search Query Syntax: price <= max_price.
filter = f'@price:[-inf {max_price}] '
# Pass the limit explicitly so that `n_limit` actually caps the number of results.
results = da.find(filter=filter, limit=n_limit)

print('\n Returned examples that verify filter "price at most 3":\n')
for price in results[:, 'tags__price']:
    print(f'\t price={price}')
```

- A Nike shoes **or** both price is less than `100` and color is `"blue"`

```JSON
{
"$or": {
"brand": {"$eq": "Nike"},
"$and": {
"price": {"$lt": 100},
"color": {"$eq": "blue"},
},
}
}
This would print

```
 Returned examples that verify filter "price at most 3":

price=0
price=1
price=2
price=3
```

(vector-search-index)=
### Update Vector Search Indexing Schema
Expand Down Expand Up @@ -471,34 +500,3 @@ results = da.find('cheap', index='price')



## Configuration

The following configs can be set:

| Name | Description | Default |
|-------------------|---------------------------------------------------------------------------------------------------|-------------------------------------------------- |
| `host` | Host address of the Redis server | `'localhost'` |
| `port` | Port of the Redis Server | `6379` |
| `redis_config` | Other Redis configs in a Dict and pass to `Redis` client constructor, e.g. `socket_timeout`, `ssl`| `{}` |
| `index_name` | Redis index name; the name of RedisSearch index to set this DocumentArray | `None` |
| `n_dim` | Dimensionality of the embeddings | `None` |
| `update_schema` | Boolean flag indicating whether to update Redis Search schema | `True` |
| `distance` | Similarity distance metric in Redis, one of {`'L2'`, `'IP'`, `'COSINE'`} | `'COSINE'` |
| `batch_size` | Batch size used to handle storage updates | `64` |
| `method` | Vector similarity index algorithm in Redis, either `FLAT` or `HNSW` | `'HNSW'` |
| `index_text` | Boolean flag indicating whether to index `.text`. `True` will enable full text search on `.text` | `None` |
| `tag_indices` | List of tags to index as text field | `[]` |
| `ef_construction` | Optional parameter for Redis HNSW algorithm | `200` |
| `m` | Optional parameter for Redis HNSW algorithm | `16` |
| `ef_runtime` | Optional parameter for Redis HNSW algorithm | `10` |
| `block_size` | Optional parameter for Redis FLAT algorithm | `1048576` |
| `initial_cap` | Optional parameter for Redis HNSW and FLAT algorithm | `None`, defaults to the default value in Redis |
| `columns` | Other fields to store in Document and build schema | `None` |

You can check the default values in [the docarray source code](https://github.com/jina-ai/docarray/blob/main/docarray/array/storage/redis/backend.py)
For vector search configurations, default values are those of the database backend, you can find them in [redis documentation](https://redis.io/docs/stack/search/reference/vectors/)

```{note}
We will support geo-filtering soon.
The benchmark test is on the way.
```
4 changes: 3 additions & 1 deletion tests/unit/array/mixins/test_exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,7 @@ def test_embedding_ops_error():
with pytest.raises(ValueError, match='Did you forget to set'):
db.find(da)
da.embeddings = np.random.random([100, 256])
with pytest.raises(ValueError, match='filter must be dict when query is None'):
with pytest.raises(
ValueError, match='filter must be dict or str when query is None'
):
da.find(None)
Loading