docs: add utils section (#1307)

* feat: add utils for map to docs and fix docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * feat: add utils for map to docs and fix docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * feat: add utils for find and fix docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * fix: fix video ndarray docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * fix: fix video find docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * fix: fix map docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * fix: fix filter docstring Signed-off-by: samsja <sami.jaghouar@hotmail.fr> * fix: fix add reduce Signed-off-by: samsja <sami.jaghouar@hotmail.fr> --------- Signed-off-by: samsja <sami.jaghouar@hotmail.fr> Signed-off-by: anna-charlotte <charlotte.gerhaher@jina.ai>
docarray · samsja · Mar 30, 2023 · Mar 23, 2023 · Mar 23, 2023 · Mar 24, 2023
commit 550981de89c9be049b6215c13b91d002e87ca08f
diff --git a/docarray/typing/tensor/video/video_ndarray.py b/docarray/typing/tensor/video/video_ndarray.py
@@ -42,15 +42,15 @@ class MyVideoDoc(BaseDoc):
         video_tensor=np.random.random((100, 224, 224, 3)),
     )
 
-    doc_1.video_tensor.save(file_path='file_1.mp4')
+    doc_1.video_tensor.save(file_path='/tmp/file_1.mp4')
 
     doc_2 = MyVideoDoc(
         title='my_second_video_doc',
-        url='file_1.mp4',
+        url='/tmp/file_1.mp4',
     )
 
     doc_2.video_tensor = parse_obj_as(VideoNdArray, doc_2.url.load().video)
-    doc_2.video_tensor.save(file_path='file_2.mp4')
+    doc_2.video_tensor.save(file_path='/tmp/file_2.mp4')
     ```
 
     ---

diff --git a/docarray/utils/filter.py b/docarray/utils/filter.py
@@ -1,3 +1,5 @@
+__all__ = ['filter_docs']
+
 import json
 from typing import Dict, List, Union
 
@@ -13,50 +15,55 @@ def filter_docs(
     Filter the Documents in the index according to the given filter query.
 
 
-    EXAMPLE USAGE
 
-    .. code-block:: python
+    ---
 
-        from docarray import DocArray, BaseDoc
-        from docarray.documents import Text, Image
-        from docarray.util.filter import filter_docs
+    ```python
+    from docarray import DocArray, BaseDoc
+    from docarray.documents import TextDoc, ImageDoc
+    from docarray.utils.filter import filter_docs
 
 
-        class MyDocument(BaseDoc):
-            caption: Text
-            image: Image
-            price: int
+    class MyDocument(BaseDoc):
+        caption: TextDoc
+        ImageDoc: ImageDoc
+        price: int
 
 
-        docs = DocArray[MyDocument](
-            [
-                MyDocument(
-                    caption='A tiger in the jungle',
-                    image=Image(url='tigerphoto.png'),
-                    price=100,
-                ),
-                MyDocument(
-                    caption='A swimming turtle', image=Image(url='turtlepic.png'), price=50
-                ),
-                MyDocument(
-                    caption='A couple birdwatching with binoculars',
-                    image=Image(url='binocularsphoto.png'),
-                    price=30,
-                ),
-            ]
-        )
-        query = {
-            '$and': {
-                'image__url': {'$regex': 'photo'},
-                'price': {'$lte': 50},
-            }
+    docs = DocArray[MyDocument](
+        [
+            MyDocument(
+                caption='A tiger in the jungle',
+                ImageDoc=ImageDoc(url='tigerphoto.png'),
+                price=100,
+            ),
+            MyDocument(
+                caption='A swimming turtle',
+                ImageDoc=ImageDoc(url='turtlepic.png'),
+                price=50,
+            ),
+            MyDocument(
+                caption='A couple birdwatching with binoculars',
+                ImageDoc=ImageDoc(url='binocularsphoto.png'),
+                price=30,
+            ),
+        ]
+    )
+    query = {
+        '$and': {
+            'ImageDoc__url': {'$regex': 'photo'},
+            'price': {'$lte': 50},
         }
+    }
+
+    results = filter_docs(docs, query)
+    assert len(results) == 1
+    assert results[0].price == 30
+    assert results[0].caption == 'A couple birdwatching with binoculars'
+    assert results[0].ImageDoc.url == 'binocularsphoto.png'
+    ```
 
-        results = filter_docs(docs, query)
-        assert len(results) == 1
-        assert results[0].price == 30
-        assert results[0].caption == 'A couple birdwatching with binoculars'
-        assert results[0].image.url == 'binocularsphoto.png'
+    ---
 
     :param docs: the DocArray where to apply the filter
     :param query: the query to filter by

diff --git a/docarray/utils/find.py b/docarray/utils/find.py
@@ -1,3 +1,5 @@
+__all__ = ['find', 'find_batched']
+
 from typing import Any, Dict, List, NamedTuple, Optional, Type, Union, cast
 
 from typing_inspect import is_union_type
@@ -34,52 +36,48 @@ def find(
     Find the closest Documents in the index to the query.
     Supports PyTorch and NumPy embeddings.
 
-    .. note::
-        This utility function is likely to be removed once
-        Document Stores are available.
-        At that point, and in-memory Document Store will serve the same purpose
-        by exposing a .find() method.
-
-    .. note::
-        This is a simple implementation that assumes the same embedding field name for
-        both query and index, does not support nested search, and does not support
-        hybrid (multi-vector) search. These shortcoming will be addressed in future
-        versions.
+    !!! note
+        This is a simple implementation of exact search. If you need to do advanced
+        search using approximate nearest neighbours search, hybrid search, or
+        multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc].
 
-    EXAMPLE USAGE
+    ---
 
-    .. code-block:: python
+    ```python
+    from docarray import DocArray, BaseDoc
+    from docarray.typing import TorchTensor
+    from docarray.utils.find import find
+    import torch
 
-        from docarray import DocArray, BaseDoc
-        from docarray.typing import TorchTensor
-        from docarray.util.find import find
 
+    class MyDocument(BaseDoc):
+        embedding: TorchTensor
 
-        class MyDocument(BaseDoc):
-            embedding: TorchTensor
 
+    index = DocArray[MyDocument](
+        [MyDocument(embedding=torch.rand(128)) for _ in range(100)]
+    )
 
-        index = DocArray[MyDocument](
-            [MyDocument(embedding=torch.rand(128)) for _ in range(100)]
-        )
+    # use Document as query
+    query = MyDocument(embedding=torch.rand(128))
+    top_matches, scores = find(
+        index=index,
+        query=query,
+        embedding_field='embedding',
+        metric='cosine_sim',
+    )
 
-        # use Document as query
-        query = MyDocument(embedding=torch.rand(128))
-        top_matches, scores = find(
-            index=index,
-            query=query,
-            embedding_field='tensor',
-            metric='cosine_sim',
-        )
+    # use tensor as query
+    query = torch.rand(128)
+    top_matches, scores = find(
+        index=index,
+        query=query,
+        embedding_field='embedding',
+        metric='cosine_sim',
+    )
+    ```
 
-        # use tensor as query
-        query = torch.rand(128)
-        top_matches, scores = find(
-            index=index,
-            query=query,
-            embedding_field='tensor',
-            metric='cosine_sim',
-        )
+    ---
 
     :param index: the index of Documents to search in
     :param query: the query to search for
@@ -123,54 +121,51 @@ def find_batched(
     Find the closest Documents in the index to the queries.
     Supports PyTorch and NumPy embeddings.
 
-    .. note::
-        This utility function is likely to be removed once
-        Document Stores are available.
-        At that point, and in-memory Document Store will serve the same purpose
-        by exposing a .find() method.
-
-    .. note::
-        This is a simple implementation that assumes the same embedding field name for
-        both query and index, does not support nested search, and does not support
-        hybrid (multi-vector) search. These shortcoming will be addressed in future
-        versions.
-
-        EXAMPLE USAGE
-
-    .. code-block:: python
-
-        from docarray import DocArray, BaseDoc
-        from docarray.typing import TorchTensor
-        from docarray.util.find import find
-
-
-        class MyDocument(BaseDoc):
-            embedding: TorchTensor
-
-
-        index = DocArray[MyDocument](
-            [MyDocument(embedding=torch.rand(128)) for _ in range(100)]
-        )
-
-        # use DocArray as query
-        query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
-        results = find(
-            index=index,
-            query=query,
-            embedding_field='tensor',
-            metric='cosine_sim',
-        )
-        top_matches, scores = results[0]
-
-        # use tensor as query
-        query = torch.rand(3, 128)
-        results, scores = find(
-            index=index,
-            query=query,
-            embedding_field='tensor',
-            metric='cosine_sim',
-        )
-        top_matches, scores = results[0]
+    !!! note
+        This is a simple implementation of exact search. If you need to do advanced
+        search using approximate nearest neighbours search, hybrid search, or
+        multi-vector search, please take a look at the [BaseDoc][docarray.base_doc.doc.BaseDoc].
+
+
+    ---
+
+    ```python
+    # from docarray import DocArray, BaseDoc
+    # from docarray.typing import TorchTensor
+    # from docarray.utils.find import find
+    # import torch
+    #
+    #
+    # class MyDocument(BaseDoc):
+    #     embedding: TorchTensor
+    #
+    #
+    # index = DocArray[MyDocument](
+    #     [MyDocument(embedding=torch.rand(128)) for _ in range(100)]
+    # )
+    #
+    # # use DocArray as query
+    # query = DocArray[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
+    # results = find(
+    #     index=index,
+    #     query=query,
+    #     embedding_field='embedding',
+    #     metric='cosine_sim',
+    # )
+    # top_matches, scores = results[0]
+    #
+    # # use tensor as query
+    # query = torch.rand(3, 128)
+    # results, scores = find(
+    #     index=index,
+    #     query=query,
+    #     embedding_field='embedding',
+    #     metric='cosine_sim',
+    # )
+    # top_matches, scores = results[0]
+    ```
+
+    ---
 
     :param index: the index of Documents to search in
     :param query: the query to search for