docarray · anna-charlotte · Feb 23, 2023 · Feb 17, 2023 · Feb 20, 2023 · Feb 20, 2023
diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
@@ -16,14 +16,16 @@
     Dict,
     Generator,
     Iterable,
+    List,
     Optional,
-    Sequence,
     Tuple,
     Type,
     TypeVar,
     Union,
 )
 
+import pandas as pd
+
 from docarray.base_document import AnyDocument, BaseDocument
 from docarray.helper import (
     _access_path_to_dict,
@@ -302,6 +304,56 @@ def to_json(self) -> str:
         """
         return json.dumps([doc.json() for doc in self])
 
+    @classmethod
+    def _check_for_valid_document_type(cls) -> None:
+        if cls.document_type == AnyDocument:
+            raise TypeError(
+                'There is no document schema defined. '
+                'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
+            )
+
+    @classmethod
+    def _check_for_valid_access_paths(cls, field_names: Optional[List[str]]) -> None:
+        if field_names is None or len(field_names) == 0:
+            raise TypeError("No field names are given.")
+
+        valid = [
+            is_access_path_valid(cls.document_type, field) for field in field_names
+        ]
+        if not all(valid):
+            raise ValueError(
+                f'Column names do not match the schema of the DocumentArray\'s document type '
+                f'({cls.document_type.__name__}): {list(compress(field_names, [not v for v in valid]))}'
+            )
+
+    @staticmethod
+    def access_path_dict_to_nested_dict(
+        access_path2val: Dict[str, Any]
+    ) -> Dict[Any, Any]:
+        """
+        Convert a dict, where the keys are access paths ("__"-separated) to a nested dictionary.
+
+        EXAMPLE USAGE
+
+        .. code-block:: python
+
+            access_path2val = {'image__url': 'some.png'}
+            assert access_path_dict_to_nested_dict(access_path2val) == {
+                'image': {'url': 'some.png'}
+            }
+
+        :param access_path2val: dict with access_paths as keys
+        :return: nested dict where the access path keys are split into separate field names and nested keys
+        """
+        nested_dict: Dict[Any, Any] = {}
+        for access_path, value in access_path2val.items():
+            field2val = _access_path_to_dict(
+                access_path=access_path,
+                value=value if value not in ['', 'None'] else None,
+            )
+            _update_nested_dicts(to_update=nested_dict, update_with=field2val)
+        return nested_dict
+
     @classmethod
     def from_csv(
         cls,
@@ -330,37 +382,23 @@ def from_csv(
         """
         from docarray import DocumentArray
 
-        doc_type = cls.document_type
-        if doc_type == AnyDocument:
-            raise TypeError(
-                'There is no document schema defined. '
-                'To load from csv, please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
-            )
+        cls._check_for_valid_document_type()
 
+        doc_type = cls.document_type
         da = DocumentArray.__class_getitem__(doc_type)()
+
         with open(file_path, 'r', encoding=encoding) as fp:
             rows = csv.DictReader(fp, dialect=dialect)
-            field_names: Optional[Sequence[Any]] = rows.fieldnames
+            field_names: List[str] = (
+                [] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
+            )
 
-            if field_names is None:
-                raise TypeError("No field names are given.")
-
-            valid = [is_access_path_valid(doc_type, field) for field in field_names]
-            if not all(valid):
-                raise ValueError(
-                    f'Fields provided in the csv file do not match the schema of the DocumentArray\'s '
-                    f'document type ({doc_type.__name__}): {list(compress(field_names, [not v for v in valid]))}'
-                )
+            cls._check_for_valid_access_paths(field_names=field_names)
 
             for access_path2val in rows:
-                doc_dict: Dict[Any, Any] = {}
-                for access_path, value in access_path2val.items():
-                    field2val = _access_path_to_dict(
-                        access_path=access_path,
-                        value=value if value not in ['', 'None'] else None,
-                    )
-                    _update_nested_dicts(to_update=doc_dict, update_with=field2val)
-
+                doc_dict: Dict[Any, Any] = IOMixinArray.access_path_dict_to_nested_dict(
+                    access_path2val
+                )
                 da.append(doc_type.parse_obj(doc_dict))
 
         return da
@@ -392,6 +430,59 @@ def to_csv(
                 doc_dict = _dict_to_access_paths(doc.dict())
                 writer.writerow(doc_dict)
 
+    @classmethod
+    def from_pandas(cls, df: pd.DataFrame) -> 'DocumentArray':
+        """
+        Load a DocumentArray from a `pandas.DataFrame` following the schema
+        defined in the :attr:`~docarray.DocumentArray.document_type` attribute.
+        Every row of the dataframe will be mapped to one document in the array.
+        The column names of the dataframe have to match the field names of the
+        Document type.
+        For nested fields use "__"-separated access paths as column names,
+        such as 'image__url'.
+
+        List-like fields (including fields of type DocumentArray) are not supported.
+
+        :param df: pandas.DataFrame to extract Document's information from
+        :return: DocumentArray
+        """
+        from docarray import DocumentArray
+
+        cls._check_for_valid_document_type()
+
+        doc_type = cls.document_type
+        da = DocumentArray.__class_getitem__(doc_type)()
+        field_names = df.columns.tolist()
+
+        cls._check_for_valid_access_paths(field_names=field_names)
+
+        for row in df.itertuples():
+            access_path2val = row._asdict()
+            access_path2val.pop('Index', None)
+            doc_dict = IOMixinArray.access_path_dict_to_nested_dict(access_path2val)
+            da.append(doc_type.parse_obj(doc_dict))
+
+        return da
+
+    def to_pandas(self) -> pd.DataFrame:
+        """
+        Save a DocumentArray to a `pandas.DataFrame`.
+        The field names will be stored as column names. Each row of the dataframe corresponds
+        to the information of one Document.
+        Columns for nested fields will be named after the "__"-separated access paths,
+        such as `"image__url"` for `image.url`.
+
+        :return: pandas.DataFrame with one row per Document and `object`-typed columns
+        """
+        fields = self.document_type._get_access_paths()
+
+        # Collect all rows first and build the frame once: `DataFrame.append` was
+        # removed in pandas 2.0 and copied the whole frame on every call.
+        rows = [_dict_to_access_paths(doc.dict()) for doc in self]
+        # dtype=object keeps `None` values as-is instead of coercing them to NaN,
+        # so that the resulting frame can be fed back into `from_pandas`.
+        return pd.DataFrame(rows, columns=fields, dtype=object)
+
     # Methods to load from/to files in different formats
     @property
     def _stream_header(self) -> bytes:

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ fastapi = {version = ">=0.87.0", optional = true }
 rich = ">=13.1.0"
 lz4 = {version= ">=1.0.0", optional = true}
 pydub = {version = "^0.25.1", optional = true }
+pandas = ">=1.1.0"
 
 [tool.poetry.extras]
 common = ["protobuf", "lz4"]
@@ -60,6 +61,10 @@ check_untyped_defs = true
 module = "av"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "pandas"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = "trimesh"
 ignore_missing_imports = true

diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py
@@ -93,9 +93,7 @@ def test_from_csv_without_schema_raise_exception():
 
 
 def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
-    with pytest.raises(
-        ValueError, match='Fields provided in the csv file do not match the schema'
-    ):
+    with pytest.raises(ValueError, match='Column names do not match the schema'):
         DocumentArray[nested_doc.__class__].from_csv(
             file_path=str(TOYDATA_DIR / 'docs.csv')
         )
diff --git a/tests/units/array/test_array_from_to_pandas.py b/tests/units/array/test_array_from_to_pandas.py
@@ -0,0 +1,85 @@
+from typing import Optional
+
+import pandas as pd
+import pytest
+
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+
+
+@pytest.fixture()
+def nested_doc_cls():
+    class MyDoc(BaseDocument):
+        count: Optional[int]
+        text: str
+
+    class MyDocNested(MyDoc):
+        image: Image
+
+    return MyDocNested
+
+
+def test_to_from_pandas_df(nested_doc_cls):
+    da = DocumentArray[nested_doc_cls](
+        [
+            nested_doc_cls(
+                count=0,
+                text='hello',
+                image=Image(url='aux.png'),
+            ),
+            nested_doc_cls(text='hello world', image=Image()),
+        ]
+    )
+    df = da.to_pandas()
+    assert isinstance(df, pd.DataFrame)
+    assert len(df) == 2
+    assert (
+        df.columns
+        == [
+            'id',
+            'count',
+            'text',
+            'image__id',
+            'image__url',
+            'image__tensor',
+            'image__embedding',
+            'image__bytes',
+        ]
+    ).all()
+
+    da_from_df = DocumentArray[nested_doc_cls].from_pandas(df)
+    for doc1, doc2 in zip(da, da_from_df):
+        assert doc1 == doc2
+
+
+@pytest.fixture()
+def nested_doc():
+    class Inner(BaseDocument):
+        img: Optional[Image]
+
+    class Middle(BaseDocument):
+        img: Optional[Image]
+        inner: Optional[Inner]
+
+    class Outer(BaseDocument):
+        img: Optional[Image]
+        middle: Optional[Middle]
+
+    doc = Outer(img=Image(), middle=Middle(img=Image(), inner=Inner(img=Image())))
+    return doc
+
+
+def test_from_pandas_without_schema_raise_exception():
+    with pytest.raises(TypeError, match='no document schema defined'):
+        df = pd.DataFrame(
+            columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]]
+        )
+        DocumentArray.from_pandas(df=df)
+
+
+def test_from_pandas_with_wrong_schema_raise_exception(nested_doc):
+    with pytest.raises(ValueError, match='Column names do not match the schema'):
+        df = pd.DataFrame(
+            columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]]
+        )
+        DocumentArray[nested_doc.__class__].from_pandas(df=df)