Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 20 additions & 12 deletions docarray/index/backends/in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,38 +48,38 @@ class InMemoryExactNNIndex(BaseDocIndex, Generic[TSchema]):
def __init__(
self,
docs: Optional[DocList] = None,
index_file_path: Optional[str] = None,
db_config=None,
**kwargs,
):
"""Initialize InMemoryExactNNIndex"""
if 'db_config' in kwargs:
kwargs.pop('db_config')
super().__init__(db_config=None, **kwargs)
super().__init__(db_config=db_config, **kwargs)
self._runtime_config = self.RuntimeConfig()
self._db_config = cast(InMemoryExactNNIndex.DBConfig, self._db_config)
self._index_file_path = self._db_config.index_file_path

if docs and index_file_path:
if docs and self._index_file_path:
raise ValueError(
'Initialize `InMemoryExactNNIndex` with either `docs` or '
'`index_file_path`, not both. Provide `docs` for a fresh index, or '
'`index_file_path` to use an existing file.'
)

if index_file_path:
if os.path.exists(index_file_path):
if self._index_file_path:
if os.path.exists(self._index_file_path):
self._logger.info(
f'Loading index from a binary file: {index_file_path}'
f'Loading index from a binary file: {self._index_file_path}'
)
self._docs = DocList.__class_getitem__(
cast(Type[BaseDoc], self._schema)
).load_binary(file=index_file_path)
).load_binary(file=self._index_file_path)

data_by_columns = self._get_col_value_dict(self._docs)
self._update_subindex_data(self._docs)
self._index_subindex(data_by_columns)

else:
self._logger.warning(
f'Index file does not exist: {index_file_path}. '
f'Index file does not exist: {self._index_file_path}. '
f'Initializing empty InMemoryExactNNIndex.'
)
self._docs = DocList.__class_getitem__(
Expand Down Expand Up @@ -137,6 +137,7 @@ def build(self, *args, **kwargs) -> Any:
class DBConfig(BaseDocIndex.DBConfig):
"""Dataclass that contains all "static" configurations of InMemoryExactNNIndex."""

index_file_path: Optional[str] = None
default_column_config: Dict[Type, Dict[str, Any]] = field(
default_factory=lambda: defaultdict(
dict,
Expand Down Expand Up @@ -442,9 +443,16 @@ def __contains__(self, item: BaseDoc):
f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
)

def persist(self, file: str = 'in_memory_index.bin') -> None:
def persist(self, file: Optional[str] = None) -> None:
"""Persist InMemoryExactNNIndex into a binary file."""
self._docs.save_binary(file=file)
DEFAULT_INDEX_FILE_PATH = 'in_memory_index.bin'
Comment thread
JoanFM marked this conversation as resolved.
file_to_save = self._index_file_path or file
if file_to_save is None:
self._logger.warning(
f'persisting index to {DEFAULT_INDEX_FILE_PATH} because no `index_file_path` has been used inside DBConfig and no `file` has been passed as argument'
)
file_to_save = file_to_save or DEFAULT_INDEX_FILE_PATH
self._docs.save_binary(file=file_to_save)

def _get_root_doc_id(self, id: str, root: str, sub: str) -> str:
"""Get the root_id given the id of a subindex Document and the root and subindex name
Expand Down
30 changes: 19 additions & 11 deletions docs/user_guide/storing/index_in_memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,20 @@ docs = DocList[MyDoc](MyDoc() for _ in range(10))
doc_index = InMemoryExactNNIndex[MyDoc]()
doc_index.index(docs)

# or in one step:
# or in one step: create the index with the docs already inserted
doc_index = InMemoryExactNNIndex[MyDoc](docs)
```

Additionally, you can preserve your index as a binary file and instantiate a new one using this file:
Alternatively, you can pass an `index_file_path` argument so that the index is persisted to, and can later be restored from, that specific file.
```python
# Save your existing index as a binary file
doc_index.persist('docs.bin')
docs = DocList[MyDoc](MyDoc() for _ in range(10))

doc_index = InMemoryExactNNIndex[MyDoc](index_file_path='docs.bin')
doc_index.index(docs)

# Persist the index to the file given as `index_file_path`
Comment thread
JoanFM marked this conversation as resolved.
doc_index.persist()

# Initialize a new document index using the saved binary file
new_doc_index = InMemoryExactNNIndex[MyDoc](index_file_path='docs.bin')
Expand All @@ -46,21 +52,23 @@ new_doc_index = InMemoryExactNNIndex[MyDoc](index_file_path='docs.bin')

This section lays out the configurations and options that are specific to [InMemoryExactNNIndex][docarray.index.backends.in_memory.InMemoryExactNNIndex].

The `DBConfig` of [InMemoryExactNNIndex][docarray.index.backends.in_memory.InMemoryExactNNIndex] contains only one entry:
the default mapping from Python types to column configurations.
The `DBConfig` of [InMemoryExactNNIndex][docarray.index.backends.in_memory.InMemoryExactNNIndex] contains two entries:
`index_file_path` and `default_column_config`, the default mapping from Python types to column configurations.

You can see in the [section below](#field-wise-configurations) how to override configurations for specific fields.
If you want to set configurations globally, i.e. for all vector fields in your Documents, you can do that using `DBConfig` or by passing them to `__init__`:

```python
from collections import defaultdict
from docarray.typing import AbstractTensor
new_doc_index = InMemoryExactNNIndex[MyDoc](default_column_config=defaultdict(
dict,
{
AbstractTensor: {'space': 'cosine_sim'},
},
))
new_doc_index = InMemoryExactNNIndex[MyDoc](
default_column_config=defaultdict(
dict,
{
AbstractTensor: {'space': 'cosine_sim'},
},
)
)
```

This will set the default configuration for all vector fields to the one specified in the example above.
Expand Down