Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions docarray/index/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,6 @@ def build(self, *args, **kwargs) -> Any:
@dataclass
class DBConfig(ABC):
index_name: Optional[str] = None

@dataclass
class RuntimeConfig(ABC):
# default configurations for every column type
# a dictionary from a column type (DB specific) to a dictionary
# of default configurations for that type
Expand All @@ -156,6 +153,10 @@ class RuntimeConfig(ABC):
# Example: `default_column_config['VARCHAR'] = {'length': 255}`
default_column_config: Dict[Type, Dict[str, Any]] = field(default_factory=dict)

@dataclass
class RuntimeConfig(ABC):
pass

@property
def index_name(self):
"""Return the name of the index in the database."""
Expand Down Expand Up @@ -896,14 +897,14 @@ def _create_single_column(self, field: 'ModelField', type_: Type) -> _ColumnInfo
if 'col_type' in custom_config.keys():
db_type = custom_config['col_type']
custom_config.pop('col_type')
if db_type not in self._runtime_config.default_column_config.keys():
if db_type not in self._db_config.default_column_config.keys():
raise ValueError(
f'The given col_type is not a valid db type: {db_type}'
)
else:
db_type = self.python_type_to_db_type(type_)

config = self._runtime_config.default_column_config[db_type].copy()
config = self._db_config.default_column_config[db_type].copy()
config.update(custom_config)
# parse n_dim from parametrized tensor type
if (
Expand Down
16 changes: 7 additions & 9 deletions docarray/index/backends/elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@

ELASTIC_PY_VEC_TYPES: List[Any] = [list, tuple, np.ndarray, AbstractTensor]


if TYPE_CHECKING:
import tensorflow as tf # type: ignore
import torch
Expand All @@ -57,7 +56,6 @@
torch = import_library('torch', raise_error=False)
tf = import_library('tensorflow', raise_error=False)


if torch is not None:
ELASTIC_PY_VEC_TYPES.append(torch.Tensor)

Expand Down Expand Up @@ -254,13 +252,7 @@ class DBConfig(BaseDocIndex.DBConfig):
es_config: Dict[str, Any] = field(default_factory=dict)
index_settings: Dict[str, Any] = field(default_factory=dict)
index_mappings: Dict[str, Any] = field(default_factory=dict)

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of ElasticDocIndex."""

default_column_config: Dict[Any, Dict[str, Any]] = field(default_factory=dict)
chunk_size: int = 500

def __post_init__(self):
self.default_column_config = {
Expand Down Expand Up @@ -323,6 +315,12 @@ def dense_vector_config(self):

return config

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of ElasticDocIndex."""

chunk_size: int = 500

###############################################
# Implementation of abstract methods #
###############################################
Expand Down Expand Up @@ -624,7 +622,7 @@ def _form_search_body(
num_candidates: Optional[int] = None,
) -> Dict[str, Any]:
if not num_candidates:
num_candidates = self._runtime_config.default_column_config['dense_vector'][
num_candidates = self._db_config.default_column_config['dense_vector'][
'num_candidates'
]
body = {
Expand Down
6 changes: 4 additions & 2 deletions docarray/index/backends/elasticv7.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,14 @@ class DBConfig(ElasticDocIndex.DBConfig):

hosts: Union[str, List[str], None] = 'http://localhost:9200' # type: ignore

def dense_vector_config(self):
return {'dims': 128}

@dataclass
class RuntimeConfig(ElasticDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of ElasticDocIndex."""

def dense_vector_config(self):
return {'dims': 128}
pass

###############################################
# Implementation of abstract methods #
Expand Down
11 changes: 6 additions & 5 deletions docarray/index/backends/hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,6 @@ class DBConfig(BaseDocIndex.DBConfig):
"""Dataclass that contains all "static" configurations of HnswDocumentIndex."""

work_dir: str = '.'

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of HnswDocumentIndex."""

default_column_config: Dict[Type, Dict[str, Any]] = field(
default_factory=lambda: {
np.ndarray: {
Expand All @@ -188,6 +183,12 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig):
}
)

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of HnswDocumentIndex."""

pass
Comment thread
JoanFM marked this conversation as resolved.

###############################################
# Implementation of abstract methods #
###############################################
Expand Down
12 changes: 6 additions & 6 deletions docarray/index/backends/in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,6 @@ def build(self, *args, **kwargs) -> Any:
class DBConfig(BaseDocIndex.DBConfig):
"""Dataclass that contains all "static" configurations of InMemoryExactNNIndex."""

pass

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of InMemoryExactNNIndex."""

default_column_config: Dict[Type, Dict[str, Any]] = field(
default_factory=lambda: defaultdict(
dict,
Expand All @@ -152,6 +146,12 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig):
)
)

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of InMemoryExactNNIndex."""

pass

def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
"""index Documents into the index.

Expand Down
11 changes: 6 additions & 5 deletions docarray/index/backends/qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,11 +241,6 @@ class DBConfig(BaseDocIndex.DBConfig):
optimizers_config: Optional[types.OptimizersConfigDiff] = None
wal_config: Optional[types.WalConfigDiff] = None
quantization_config: Optional[types.QuantizationConfig] = None

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of QdrantDocumentIndex."""

default_column_config: Dict[Type, Dict[str, Any]] = field(
default_factory=lambda: {
'id': {}, # type: ignore[dict-item]
Expand All @@ -254,6 +249,12 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig):
}
)

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of QdrantDocumentIndex."""

pass

def python_type_to_db_type(self, python_type: Type) -> Any:
"""Map python type to database type.
Takes any python type and returns the corresponding database column type.
Expand Down
9 changes: 4 additions & 5 deletions docarray/index/backends/weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,6 @@ class DBConfig(BaseDocIndex.DBConfig):
scopes: List[str] = field(default_factory=lambda: ["offline_access"])
auth_api_key: Optional[str] = None
embedded_options: Optional[EmbeddedOptions] = None

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of WeaviateDocumentIndex."""

default_column_config: Dict[Any, Dict[str, Any]] = field(
default_factory=lambda: {
np.ndarray: {},
Expand All @@ -264,6 +259,10 @@ class RuntimeConfig(BaseDocIndex.RuntimeConfig):
}
)

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
"""Dataclass that contains all "dynamic" configurations of WeaviateDocumentIndex."""

batch_config: Dict[str, Any] = field(
default_factory=lambda: DEFAULT_BATCH_CONFIG
)
Expand Down
25 changes: 13 additions & 12 deletions docs/how_to/add_doc_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,12 @@ To define what can be stored in them, and what the default values are, you need
```python
@dataclass
class DBConfig(BaseDocIndex.DBConfig):
...
default_column_config: Dict[Type, Dict[str, Any]] = ...


@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
default_column_config: Dict[Type, Dict[str, Any]] = ...
...
```

!!! note
Expand All @@ -306,16 +306,8 @@ The `DBConfig` class defines the static configurations of your Document Index.
These are configurations that are tied to the database (or library) running in the background, such as `host`, `port`, etc.
Here you should put everything that the user cannot or should not change after initialization.

### The `RuntimeConfig` class

The `RuntimeConfig` class defines the dynamic configurations of your Document Index.
These are configurations that can be changed at runtime, for example default behaviours such as batch sizes, consistency levels, etc.

It is a common pattern to allow such parameters both in the `RuntimeConfig`, where they will act as global defaults, and
in specific methods (`index`, `find`, etc.), where they will act as local overrides.

!!! note
Every `RuntimeConfig` needs to contain a `default_column_config` field.
Every `DBConfig` needs to contain a `default_column_config` field.
This is a dictionary that, for each possible column type in your database, defines a default configuration for that column type.
This will automatically be passed to a `_ColumnInfo` whenever a user does not manually specify a configuration for that column.

Expand All @@ -327,6 +319,15 @@ and for `varchar` columns you could define a `max_length` configuration.

It is probably best to see this in action, so you should check out the `HnswDocumentIndex` implementation.

### The `RuntimeConfig` class

The `RuntimeConfig` class defines the dynamic configurations of your Document Index.
These are configurations that can be changed at runtime, for example default behaviours such as batch sizes, consistency levels, etc.

It is a common pattern to allow such parameters both in the `RuntimeConfig`, where they will act as global defaults, and
in specific methods (`index`, `find`, etc.), where they will act as local overrides.


## Implement abstract methods for indexing, searching, and deleting

After you've done the basic setup above, you can jump into the good stuff: implementing the actual indexing, searching, and deleting.
Expand Down Expand Up @@ -374,7 +375,7 @@ class MySchema(BaseDoc):

In this case, the `db_type` of `my_num` will be `'float64'` and the `db_type` of `my_text` will be `'varchar'`.
Additional information regarding the `col_type`, such as `max_len` for `varchar`, will be stored in the `_ColumnInfo.config`.
The given `col_type` has to be a valid `db_type`, meaning that it has to be described in the index's `RuntimeConfig.default_column_config`.
The given `col_type` has to be a valid `db_type`, meaning that it has to be described in the index's `DBConfig.default_column_config`.

### The `_index()` method

Expand Down
39 changes: 32 additions & 7 deletions docs/user_guide/storing/docindex.md
Original file line number Diff line number Diff line change
Expand Up @@ -445,26 +445,25 @@ You can customize every field in this configuration:

### Runtime configurations

_Runtime configurations_ are configurations that pertain to the entire database or table (as opposed to just a specific column),
and that you can dynamically change at runtime.
_Runtime configurations_ are configurations that relate to how an instance of a Document Index operates with respect to a specific
database.


This commonly includes:
- default batch size for batching operations
- default mapping from Python types to database column types
- default consistency level for various database operations
- ...

For every backend, you can get the full list of configurations and their defaults:

```python
from docarray.index import HnswDocumentIndex
from docarray.index import ElasticDocIndex


runtime_config = HnswDocumentIndex.RuntimeConfig()
runtime_config = ElasticDocIndex.RuntimeConfig()
print(runtime_config)

# > HnswDocumentIndex.RuntimeConfig(default_column_config={<class 'numpy.ndarray'>: {'dim': -1, 'index': True, 'space': 'l2', 'max_elements': 1024, 'ef_construction': 200, 'ef': 10, 'M': 16, 'allow_replace_deleted': True, 'num_threads': 1}, None: {}})
# > ElasticDocIndex.RuntimeConfig(chunk_size=500)
```

As you can see, `HnswDocumentIndex.RuntimeConfig` is a dataclass that contains only one configuration:
Expand Down Expand Up @@ -559,7 +558,33 @@ The `HnswDocumentIndex` above contains two columns which are configured differen
- `tens` has a dimensionality of `100`, can take up to `12` elements, and uses the `cosine` similarity space
- `tens_two` has a dimensionality of `10`, and uses the `ip` similarity space, and an `M` hyperparameter of 4

All configurations that are not explicitly set will be taken from the `default_column_config` of the `RuntimeConfig`.
All configurations that are not explicitly set will be taken from the `default_column_config` of the `DBConfig`.
You can modify these defaults in the following way:

```python
# Example: override the per-column-type defaults that live on the DBConfig.
import numpy as np
from pydantic import Field

from docarray import BaseDoc
from docarray.index import HnswDocumentIndex
from docarray.typing import NdArray


class Schema(BaseDoc):
    # `tens` sets `max_elements` and `space` explicitly for this column
    tens: NdArray[100] = Field(max_elements=12, space='cosine')
    # `tens_two` sets `M` and `space`; anything not set here comes from the defaults
    tens_two: NdArray[10] = Field(M=4, space='ip')


# create a DBConfig for your Document Index
conf = HnswDocumentIndex.DBConfig(work_dir='/tmp/my_db')
# update the default max_elements for np.ndarray columns
# (subscript the dict directly: a missing column type then raises a clear
# KeyError instead of an AttributeError on the None returned by `.get()`)
conf.default_column_config[np.ndarray].update(max_elements=2048)
# create Document Index
# tens has a max_elements of 12, specified in the schema
# tens_two has a max_elements of 2048, specified by the default in the DBConfig
db = HnswDocumentIndex[Schema](conf)
```


For an explanation of the configurations that are tweaked in this example, see the `HnswDocumentIndex` [documentation](index_hnswlib.md).

Expand Down
19 changes: 10 additions & 9 deletions docs/user_guide/storing/index_elastic.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,19 +491,11 @@ The following configs can be set in `DBConfig`:
| `index_name` | Elasticsearch index name, the name of Elasticsearch index object | None. Data will be stored in an index named after the Document type used as schema. |
| `index_settings` | Other [index settings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/index-modules.html#index-modules-settings) in a Dict for creating the index | dict |
| `index_mappings` | Other [index mappings](https://www.elastic.co/guide/en/elasticsearch/reference/8.6/mapping.html) in a Dict for creating the index | dict |
| `default_column_config` | The default configurations for every column type. | dict |

You can pass any of the above as keyword arguments to the `__init__()` method or pass an entire configuration object.
See [here](docindex.md#configuration-options#customize-configurations) for more information.

### RuntimeConfig

The `RuntimeConfig` dataclass of `ElasticDocIndex` consists of `default_column_config` and `chunk_size`. You can change `chunk_size` for batch operations:

```python
doc_index = ElasticDocIndex[SimpleDoc]()
doc_index.configure(ElasticDocIndex.RuntimeConfig(chunk_size=1000))
```

`default_column_config` holds the default configuration for every column type. Since there are many column types in Elasticsearch, you can also consider changing the column config when defining the schema.

```python
Expand All @@ -514,5 +506,14 @@ class SimpleDoc(BaseDoc):
doc_index = ElasticDocIndex[SimpleDoc]()
```

### RuntimeConfig

The `RuntimeConfig` dataclass of `ElasticDocIndex` contains a single field, `chunk_size`. You can change `chunk_size` for batch operations:

```python
doc_index = ElasticDocIndex[SimpleDoc]()
doc_index.configure(ElasticDocIndex.RuntimeConfig(chunk_size=1000))
```

You can pass the above as keyword arguments to the `configure()` method or pass an entire configuration object.
See [here](docindex.md#configuration-options#customize-configurations) for more information.
Loading