docarray · AnneYang720 · May 11, 2023 · Apr 20, 2023 · Apr 20, 2023 · Apr 20, 2023
diff --git a/docarray/index/abstract.py b/docarray/index/abstract.py
diff --git a/docarray/index/backends/elastic.py b/docarray/index/backends/elastic.py
@@ -25,6 +25,7 @@
 
 import docarray.typing
 from docarray import BaseDoc
+from docarray.array.any_array import AnyDocArray
 from docarray.index.abstract import BaseDocIndex, _ColumnInfo, _raise_not_composable
 from docarray.typing import AnyTensor
 from docarray.typing.tensor.abstract_tensor import AbstractTensor
@@ -39,6 +40,8 @@
 
 
 if TYPE_CHECKING:
+    import tensorflow as tf  # type: ignore
+    import torch
     from elastic_transport import NodeConfig
     from elasticsearch import Elasticsearch
     from elasticsearch.helpers import parallel_bulk
@@ -90,6 +93,8 @@ def __init__(self, db_config=None, **kwargs):
         mappings.update(self._db_config.index_mappings)
 
         for col_name, col in self._column_infos.items():
+            if issubclass(col.docarray_type, AnyDocArray):
+                continue
             if col.db_type == 'dense_vector' and (
                 not col.n_dim and col.config['dims'] < 0
             ):
@@ -100,7 +105,6 @@ def __init__(self, db_config=None, **kwargs):
 
             mappings['properties'][col_name] = self._create_index_mapping(col)
 
-        # print(mappings['properties'])
         if self._client.indices.exists(index=self.index_name):
             self._client_put_mapping(mappings)
         else:
@@ -334,6 +338,8 @@ def _index(
         refresh: bool = True,
         chunk_size: Optional[int] = None,
     ):
+        self._index_subindex(column_to_data)
+
         data = self._transpose_col_value_dict(column_to_data)
         requests = []
 
@@ -343,6 +349,8 @@ def _index(
                 '_id': row['id'],
             }
             for col_name, col in self._column_infos.items():
+                if issubclass(col.docarray_type, AnyDocArray):
+                    continue
                 if col.db_type == 'dense_vector' and np.all(row[col_name] == 0):
                     row[col_name] = row[col_name] + 1.0e-9
                 if row[col_name] is None:
@@ -383,7 +391,7 @@ def _del_items(
 
         self._refresh(self.index_name)
 
-    def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]:
+    def _get_items(self, doc_ids: Sequence[str]) -> Sequence[Dict[str, Any]]:
         accumulated_docs = []
         accumulated_docs_id_not_found = []
 
@@ -515,6 +523,22 @@ def _text_search_batched(
         )
         return _FindResultBatched(documents=list(das), scores=scores)
 
+    def _filter_by_parent_id(self, id: str) -> List[str]:
+        """Return the ids of the Documents whose `parent_id` equals `id`.
+
+        :param id: value to match against the `parent_id` field
+        :return: the `id` field of every hit in the search response
+        """
+        # NOTE(review): no explicit page size is passed here -- confirm that
+        # the default of `_client_search` cannot truncate large result sets.
+        response = self._client_search(
+            query={'term': {'parent_id': id}},
+            fields=['id'],
+            _source=False,
+        )
+        # the requested `id` field is read as the first element of a list
+        return [hit['fields']['id'][0] for hit in response['hits']['hits']]
+
     ###############################################
     # Helpers                                     #
     ###############################################

diff --git a/docarray/index/backends/hnswlib.py b/docarray/index/backends/hnswlib.py
@@ -1,3 +1,4 @@
+import glob
 import hashlib
 import os
 import sqlite3
@@ -7,6 +8,7 @@
     TYPE_CHECKING,
     Any,
     Dict,
+    Generator,
     Generic,
     List,
     Optional,
@@ -21,6 +23,7 @@
 import numpy as np
 
 from docarray import BaseDoc, DocList
+from docarray.array.any_array import AnyDocArray
 from docarray.index.abstract import (
     BaseDocIndex,
     _ColumnInfo,
@@ -67,11 +70,16 @@
 class HnswDocumentIndex(BaseDocIndex, Generic[TSchema]):
     def __init__(self, db_config=None, **kwargs):
         """Initialize HnswDocumentIndex"""
+        if db_config is not None and getattr(db_config, 'index_name'):
+            db_config.work_dir = db_config.index_name.replace("__", "/")
+
         super().__init__(db_config=db_config, **kwargs)
         self._db_config = cast(HnswDocumentIndex.DBConfig, self._db_config)
         self._work_dir = self._db_config.work_dir
         self._logger.debug(f'Working directory set to {self._work_dir}')
-        load_existing = os.path.exists(self._work_dir) and os.listdir(self._work_dir)
+        load_existing = os.path.exists(self._work_dir) and glob.glob(
+            f'{self._work_dir}/*.bin'
+        )
         Path(self._work_dir).mkdir(parents=True, exist_ok=True)
 
         # HNSWLib setup
@@ -90,6 +98,8 @@ def __init__(self, db_config=None, **kwargs):
         }
         self._hnsw_indices = {}
         for col_name, col in self._column_infos.items():
+            if issubclass(col.docarray_type, AnyDocArray):
+                continue
             if not col.config:
                 # non-tensor type; don't create an index
                 continue
@@ -118,6 +128,21 @@ def __init__(self, db_config=None, **kwargs):
         self._sqlite_conn.commit()
         self._logger.info(f'{self.__class__.__name__} has been initialized')
 
+    @property
+    def index_name(self):
+        """Return the working directory, which serves as this index's name."""
+        return self._db_config.work_dir  # type: ignore
+
+    @property
+    def out_schema(self) -> Type[BaseDoc]:
+        """Return the schema that Documents are returned in.
+
+        A subindex reports `_ori_schema`; any other index reports `_schema`.
+        """
+        if not self._is_subindex:
+            return cast(Type[BaseDoc], self._schema)
+        return self._ori_schema
+
     ###############################################
     # Inner classes for query builder and configs #
     ###############################################
@@ -184,9 +205,32 @@ def python_type_to_db_type(self, python_type: Type) -> Any:
 
         return None  # all types allowed, but no db type needed
 
-    def _index(self, column_data_dic, **kwargs):
-        # not needed, we implement `index` directly
-        ...
+    def _index(
+        self,
+        column_to_data: Dict[str, Generator[Any, None, None]],
+        docs_validated: Sequence[BaseDoc] = (),
+        **kwargs,
+    ):
+        """Index the subindex data, then add the indexed columns to HNSWLib.
+
+        :param column_to_data: maps each column name to a generator of its values
+        :param docs_validated: Documents whose hashed ids are passed to HNSWLib
+            as the ids of the added items
+        :param kwargs: accepted because `index` forwards its keyword arguments
+            to this method; not used here
+        """
+        self._index_subindex(column_to_data)
+
+        hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in docs_validated)
+        # the HNSWLib indices are filled one column after the other;
+        # this could be improved by processing them in parallel
+        for col_name, index in self._hnsw_indices.items():
+            vectors = np.stack(
+                [self._to_numpy(arr) for arr in column_to_data[col_name]]
+            )
+            index.add_items(vectors, ids=hashed_ids)
+            # write the updated index to its file location
+            index.save_index(self._hnsw_locations[col_name])
 
     def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
         """Index Documents into the index.
@@ -206,16 +241,10 @@ def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
         n_docs = 1 if isinstance(docs, BaseDoc) else len(docs)
         self._logger.debug(f'Indexing {n_docs} documents')
         docs_validated = self._validate_docs(docs)
+        self._update_subindex_data(docs_validated)
         data_by_columns = self._get_col_value_dict(docs_validated)
-        hashed_ids = tuple(self._to_hashed_id(doc.id) for doc in docs_validated)
-        # indexing into HNSWLib and SQLite sequentially
-        # could be improved by processing in parallel
-        for col_name, index in self._hnsw_indices.items():
-            data = data_by_columns[col_name]
-            data_np = [self._to_numpy(arr) for arr in data]
-            data_stacked = np.stack(data_np)
-            index.add_items(data_stacked, ids=hashed_ids)
-            index.save_index(self._hnsw_locations[col_name])
+
+        self._index(data_by_columns, docs_validated, **kwargs)
 
         self._send_docs_to_sqlite(docs_validated)
         self._sqlite_conn.commit()
@@ -312,6 +341,15 @@ def _text_search_batched(
 
     def _del_items(self, doc_ids: Sequence[str]):
         # delete from the indices
+        for field_name, type_, _ in self._flatten_schema(
+            cast(Type[BaseDoc], self._schema)
+        ):
+            if issubclass(type_, AnyDocArray):
+                for id in doc_ids:
+                    doc = self.__getitem__(id)
+                    sub_ids = [sub_doc.id for sub_doc in getattr(doc, field_name)]
+                    del self._subindices[field_name][sub_ids]
+
         try:
             for doc_id in doc_ids:
                 id_ = self._to_hashed_id(doc_id)
@@ -323,8 +361,15 @@ def _del_items(self, doc_ids: Sequence[str]):
         self._delete_docs_from_sqlite(doc_ids)
         self._sqlite_conn.commit()
 
-    def _get_items(self, doc_ids: Sequence[str]) -> Sequence[TSchema]:
-        out_docs = self._get_docs_sqlite_doc_id(doc_ids)
+    def _get_items(self, doc_ids: Sequence[str], out: bool = True) -> Sequence[TSchema]:
+        """Get Documents from the hnswlib index, by `id`.
+
+        :param doc_ids: ids to get from the Document index
+        :param out: return the Documents in the original schema (True) or the inner schema (False) for subindex
+        :return: Sequence of Documents, sorted corresponding to the order of `doc_ids`. Duplicate `doc_ids` can be omitted in the output.
+        :raises KeyError: if no Document is found for `doc_ids`
+        """
+        out_docs = self._get_docs_sqlite_doc_id(doc_ids, out)
         if len(out_docs) == 0:
             raise KeyError(f'No document with id {doc_ids} found')
         return out_docs
@@ -391,7 +436,7 @@ def _send_docs_to_sqlite(self, docs: Sequence[BaseDoc]):
             ((id_, self._doc_to_bytes(doc)) for id_, doc in zip(ids, docs)),
         )
 
-    def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]):
+    def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int], out: bool = True):
         for id_ in univ_ids:
             # I hope this protects from injection attacks
             # properly binding with '?' doesn't work for some reason
@@ -401,13 +446,29 @@ def _get_docs_sqlite_unsorted(self, univ_ids: Sequence[int]):
             'SELECT data FROM docs WHERE doc_id IN %s' % sql_id_list,
         )
         rows = self._sqlite_cursor.fetchall()
-        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
-        return docs_cls([self._doc_from_bytes(row[0]) for row in rows])
+        schema = self.out_schema if out else self._schema
+        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], schema))
+        return docs_cls([self._doc_from_bytes(row[0], out) for row in rows])
 
-    def _get_docs_sqlite_doc_id(self, doc_ids: Sequence[str]) -> DocList[TSchema]:
+    def _get_docs_sqlite_doc_id(
+        self, doc_ids: Sequence[str], out: bool = True
+    ) -> DocList[TSchema]:
+        """Fetch Documents from SQLite by `id`, ordered like `doc_ids`.
+
+        :param doc_ids: ids of the Documents to fetch
+        :param out: build the result with `out_schema` (True) or `_schema` (False)
+        :return: the fetched Documents, sorted by the position at which their
+            id first appears in `doc_ids`
+        """
         hashed_ids = tuple(self._to_hashed_id(id_) for id_ in doc_ids)
-        docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids)
-        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
-        return docs_cls(sorted(docs_unsorted, key=lambda doc: doc_ids.index(doc.id)))
+        docs_unsorted = self._get_docs_sqlite_unsorted(hashed_ids, out)
+        schema = self.out_schema if out else self._schema
+        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], schema))
+        # look positions up in a dict built once instead of calling
+        # `doc_ids.index` (a linear scan) for every sorted Document
+        position: Dict[str, int] = {}
+        for pos, doc_id in enumerate(doc_ids):
+            position.setdefault(doc_id, pos)  # keep the first occurrence
+        return docs_cls(sorted(docs_unsorted, key=lambda doc: position[doc.id]))
 
     def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList:
@@ -416,7 +465,7 @@ def _get_docs_sqlite_hashed_id(self, hashed_ids: Sequence[int]) -> DocList:
         def _in_position(doc):
             return hashed_ids.index(self._to_hashed_id(doc.id))
 
-        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
+        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self.out_schema))
         return docs_cls(sorted(docs_unsorted, key=_in_position))
 
     def _delete_docs_from_sqlite(self, doc_ids: Sequence[Union[str, int]]):
@@ -436,6 +485,35 @@ def _get_num_docs_sqlite(self) -> int:
     def _doc_to_bytes(self, doc: BaseDoc) -> bytes:
         return doc.to_protobuf().SerializeToString()
 
-    def _doc_from_bytes(self, data: bytes) -> BaseDoc:
-        schema_cls = cast(Type[BaseDoc], self._schema)
+    def _doc_from_bytes(self, data: bytes, out: bool = True) -> BaseDoc:
+        """Deserialize protobuf bytes into a Document.
+
+        :param data: serialized Document bytes
+        :param out: build the Document with `out_schema` (True) or `_schema` (False)
+        :return: the deserialized Document
+        """
+        schema_cls = cast(Type[BaseDoc], self.out_schema if out else self._schema)
         return schema_cls.from_protobuf(DocProto.FromString(data))
+
+    def _get_root_doc_id(self, id: str, root: str, sub: str) -> str:
+        """Get the root_id given the id of a subindex Document and the root and subindex name for hnswlib.
+
+        :param id: id of the subindex Document
+        :param root: root index name, used as the key into `self._subindices`
+        :param sub: subindex name; `__`-separated when nested, empty for a direct child
+        :return: the root_id of the Document
+        """
+        subindex = self._subindices[root]
+
+        if sub:
+            # resolve one level down inside the subindex first, then map the
+            # resulting id to its parent on this level
+            head, _, rest = sub.partition('__')
+            cur_root_id = subindex._get_root_doc_id(id, head, rest)
+            return self._get_root_doc_id(cur_root_id, root, '')
+
+        sub_doc = subindex._get_items([id], out=False)[0]  # type: ignore
+        # the item is either a plain dict or a Document; read `parent_id` accordingly
+        if isinstance(sub_doc, dict):
+            return sub_doc['parent_id']
+        return sub_doc.parent_id
diff --git a/docarray/index/backends/qdrant.py b/docarray/index/backends/qdrant.py
@@ -21,6 +21,7 @@
 
 import docarray.typing.id
 from docarray import BaseDoc, DocList
+from docarray.array.any_array import AnyDocArray
 from docarray.index.abstract import (
     BaseDocIndex,
     _ColumnInfo,
@@ -65,6 +66,9 @@ class QdrantDocumentIndex(BaseDocIndex, Generic[TSchema]):
 
     def __init__(self, db_config=None, **kwargs):
         """Initialize QdrantDocumentIndex"""
+        if db_config is not None and getattr(db_config, 'index_name'):
+            db_config.collection_name = db_config.index_name
+
         super().__init__(db_config=db_config, **kwargs)
         self._db_config: QdrantDocumentIndex.DBConfig = cast(
             QdrantDocumentIndex.DBConfig, self._db_config
@@ -98,6 +102,10 @@ def collection_name(self):
 
         return self._db_config.collection_name or default_collection_name
 
+    @property
+    def index_name(self):
+        return self.collection_name  # the collection name doubles as the index name
+
     @dataclass
     class Query:
         """Dataclass describing a query."""
@@ -264,11 +272,14 @@ def _initialize_collection(self):
         try:
             self._client.get_collection(self.collection_name)
         except (UnexpectedResponse, RpcError, ValueError):
-            vectors_config = {
-                column_name: self._to_qdrant_vector_params(column_info)
-                for column_name, column_info in self._column_infos.items()
-                if column_info.db_type == 'vector'
-            }
+            vectors_config = {}
+
+            for column_name, column_info in self._column_infos.items():
+                if column_info.db_type == 'vector':
+                    vectors_config[column_name] = self._to_qdrant_vector_params(
+                        column_info
+                    )
+
             self._client.create_collection(
                 collection_name=self.collection_name,
                 vectors_config=vectors_config,
@@ -288,6 +299,8 @@ def _initialize_collection(self):
             )
 
     def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
+        self._index_subindex(column_to_data)
+
         rows = self._transpose_col_value_dict(column_to_data)
         # TODO: add batching the documents to avoid timeouts
         points = [self._build_point_from_row(row) for row in rows]
@@ -332,7 +345,10 @@ def _get_items(
             with_payload=True,
             with_vectors=True,
         )
-        return [self._convert_to_doc(point) for point in response]
+        return sorted(
+            [self._convert_to_doc(point) for point in response],
+            key=lambda x: doc_ids.index(x['id']),
+        )
 
     def execute_query(self, query: Union[Query, RawQuery], *args, **kwargs) -> DocList:
         """
@@ -532,11 +548,41 @@ def _text_search_batched(
             ],
         )
 
+    def _filter_by_parent_id(self, id: str) -> Optional[List[str]]:
+        """Return the ids of all points whose `parent_id` payload equals `id`.
+
+        :param id: value to match against the `parent_id` payload field
+        :return: the `id` payload of every matching point
+        """
+        parent_filter = rest.Filter(
+            must=[
+                rest.FieldCondition(key='parent_id', match=rest.MatchValue(value=id))
+            ]
+        )
+        ids: List[str] = []
+        offset = None
+        # `scroll` returns one page per call together with the offset of the
+        # next page, so follow that offset until it is None; a single call
+        # would drop every match beyond the first page.
+        while True:
+            points, offset = self._client.scroll(
+                # the property also covers the default collection name
+                collection_name=self.collection_name,
+                scroll_filter=parent_filter,
+                offset=offset,
+                with_payload=rest.PayloadSelectorInclude(include=['id']),
+            )
+            ids.extend(point.payload['id'] for point in points)  # type: ignore
+            if offset is None:
+                return ids
+
     def _build_point_from_row(self, row: Dict[str, Any]) -> rest.PointStruct:
         point_id = self._to_qdrant_id(row.get('id'))
         vectors: Dict[str, List[float]] = {}
         payload: Dict[str, Any] = {'__generated_vectors': []}
         for column_name, column_info in self._column_infos.items():
+            if issubclass(column_info.docarray_type, AnyDocArray):
+                continue
             if column_info.db_type in ['id', 'payload']:
                 payload[column_name] = row.get(column_name)
                 continue