Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docarray/index/backends/hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@ def _create_docs_table(self):
def _send_docs_to_sqlite(self, docs: Sequence[BaseDoc]):
    """Persist ``docs`` in the SQLite ``docs`` table, one row per document.

    Every row is the pair ``(self._to_hashed_id(doc.id),
    self._doc_to_bytes(doc))``.

    ``INSERT OR REPLACE`` is used instead of a plain ``INSERT`` so that
    sending a document whose hashed id is already stored overwrites the old
    row rather than failing; this is what lets callers update a document by
    indexing it again.
    NOTE(review): this relies on the id column of ``docs`` being declared
    PRIMARY KEY/UNIQUE in ``_create_docs_table`` (not visible here) -- confirm.

    :param docs: the documents to write.
    """
    # Lazily build the (hashed id, serialized doc) pairs; executemany
    # consumes the generator row by row.
    rows = ((self._to_hashed_id(doc.id), self._doc_to_bytes(doc)) for doc in docs)
    self._sqlite_cursor.executemany(
        'INSERT OR REPLACE INTO docs VALUES (?, ?)',
        rows,
    )

Expand Down
54 changes: 53 additions & 1 deletion docs/user_guide/storing/index_hnswlib.md
Original file line number Diff line number Diff line change
Expand Up @@ -208,4 +208,56 @@ To delete nested data, you need to specify the `id`.
del doc_index[index_docs[6].id]
```

Check [here](docindex#nested-data-with-subindex) for nested data with subindex.
Check [here](../docindex#nested-data-with-subindex) for nested data with subindex.

### Update elements
In order to update a Document inside the index, you only need to reindex it with the updated attributes.

First, let's create a schema for our index:
```python
import numpy as np
from docarray import BaseDoc, DocList
from docarray.typing import NdArray
from docarray.index import HnswDocumentIndex
class MyDoc(BaseDoc):
text: str
embedding: NdArray[128]
```
Now we can instantiate our Index and index some data.

```python
docs = DocList[MyDoc](

@jupyterjazz jupyterjazz Jun 1, 2023

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd show an example where we retrieve some data (find, filter..) and replace only those docs, otherwise I don't think this is a relatable example

[MyDoc(embedding=np.random.rand(128), text=f'I am the first version of Document {i}') for i in range(100)]
)
index = HnswDocumentIndex[MyDoc]()
index.index(docs)
assert index.num_docs() == 100
```

Now we can find relevant documents

```python
res = index.find(query=docs[0], search_field='embedding', limit=100)
assert len(res.documents) == 100
for doc in res.documents:
assert 'I am the first version' in doc.text
```

Next, we update the text of all of these documents and reindex them:

```python
for i, doc in enumerate(docs):
doc.text = f'I am the second version of Document {i}'

index.index(docs)
assert index.num_docs() == 100
```

When we retrieve them again we can see that their text attribute has been updated accordingly

```python
res = index.find(query=docs[0], search_field='embedding', limit=100)
assert len(res.documents) == 100
for doc in res.documents:
assert 'I am the second version' in doc.text
```
50 changes: 50 additions & 0 deletions tests/index/hnswlib/test_index_get_del.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,3 +362,53 @@ def test_num_docs(ten_simple_docs, tmp_path):

del index[more_docs[2].id, ten_simple_docs[7].id]
assert index.num_docs() == 10


def test_update_payload(tmp_path):
    """Indexing the same documents a second time replaces their stored text.

    The documents are indexed, their ``text`` is modified in place, and they
    are indexed again. The document count must stay the same and every
    document returned by ``find`` must carry the modified text.
    """

    class TextSimpleDoc(SimpleDoc):
        text: str = 'hey'

    n_docs = 100
    docs = DocList[TextSimpleDoc](
        [TextSimpleDoc(tens=np.random.rand(10), text=f'hey {i}') for i in range(n_docs)]
    )
    index = HnswDocumentIndex[TextSimpleDoc](work_dir=str(tmp_path))

    # First pass: plain insertion.
    index.index(docs)
    assert index.num_docs() == n_docs

    # Change only the payload; the ids stay as they were.
    for doc in docs:
        doc.text = doc.text + '_changed'

    # Second pass: same ids, so the count must not grow.
    index.index(docs)
    assert index.num_docs() == n_docs

    retrieved = index.find(query=docs[0], search_field='tens', limit=n_docs).documents
    assert len(retrieved) == n_docs
    assert all('_changed' in hit.text for hit in retrieved)


def test_update_embedding(tmp_path):
    """Indexing one document again with a new tensor replaces its stored tensor.

    After the initial indexing, the first document gets a fresh random
    ``tens`` and is indexed on its own. The document count must stay the
    same, and the document with that id returned by ``find`` must hold the
    new tensor.
    """

    class TextSimpleDoc(SimpleDoc):
        text: str = 'hey'

    n_docs = 100
    docs = DocList[TextSimpleDoc](
        [TextSimpleDoc(tens=np.random.rand(10), text=f'hey {i}') for i in range(n_docs)]
    )
    index = HnswDocumentIndex[TextSimpleDoc](work_dir=str(tmp_path))
    index.index(docs)
    assert index.num_docs() == n_docs

    # Replace the embedding of the first document only and send it again.
    new_tensor = np.random.rand(10)
    docs[0].tens = new_tensor
    index.index(docs[0])
    assert index.num_docs() == n_docs

    res = index.find(query=docs[0], search_field='tens', limit=n_docs)
    assert len(res.documents) == n_docs

    # Collect the hits carrying the updated id, then check their tensor.
    updated_id = docs[0].id
    matches = [hit for hit in res.documents if hit.id == updated_id]
    assert all((hit.tens == new_tensor).all() for hit in matches)
    assert matches