Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
dce8c6b
feat: load from and to csv
Feb 17, 2023
37892a9
fix: from to csv
Feb 20, 2023
e1e90cf
feat: add access path to dict
Feb 20, 2023
7bd7f1f
fix: from to csv
Feb 20, 2023
05e19ba
fix: clean up
Feb 20, 2023
5b0c760
docs: add docstring and update tmpdir in test
Feb 20, 2023
5babdae
fix: merge nested dicts
Feb 20, 2023
d6b29c5
fix: clean up
Feb 20, 2023
9149e57
fix: clean up
Feb 20, 2023
3d980b7
test: update test
Feb 20, 2023
516ffbb
fix: apply samis suggestion from code review
Feb 20, 2023
5fac964
Merge branch 'feat-rewrite-v2' into feat-from-to-csv
Feb 20, 2023
a0d9711
fix: apply suggestions from code review wrt access paths
Feb 21, 2023
c9005b1
fix: apply johannes suggestion
Feb 21, 2023
9fe58f5
fix: apply johannes suggestion
Feb 21, 2023
90867bb
fix: apply suggestions from code review
Feb 21, 2023
1a395da
fix: apply suggestions from code review
Feb 21, 2023
00a9ea7
fix: typos
Feb 21, 2023
e06e533
refactor: move helper functions to helper file
Feb 21, 2023
c8e4cf8
test: fix fixture
Feb 21, 2023
6e3a624
feat: add to and from pandas df for documentarray
Feb 22, 2023
12df134
chore: add pandas to pyproject.toml
Feb 22, 2023
504c13c
docs: update docstring
Feb 22, 2023
4bf8976
fix: mypy
Feb 22, 2023
ca2bf4f
fix: clean up
Feb 22, 2023
81ea38d
Merge remote-tracking branch 'origin/feat-rewrite-v2' into feat-from-…
Feb 22, 2023
5d02b35
fix: apply suggestions from code review
Feb 23, 2023
687d98a
fix: apply suggestion from johannes
Feb 23, 2023
2c02bf4
fix: apply suggestion from johannes
Feb 23, 2023
006a186
fix: apply suggestions from code review
Feb 23, 2023
c68a48c
fix: apply suggestion
Feb 23, 2023
fb74c4f
Merge branch 'feat-rewrite-v2' into feat-from-to-pandas
Feb 23, 2023
9d90053
fix: apply suggestions
Feb 23, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 116 additions & 25 deletions docarray/array/array/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@
Dict,
Generator,
Iterable,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
)

import pandas as pd
Comment thread
anna-charlotte marked this conversation as resolved.
Outdated

from docarray.base_document import AnyDocument, BaseDocument
from docarray.helper import (
_access_path_to_dict,
Expand Down Expand Up @@ -302,6 +304,56 @@ def to_json(self) -> str:
"""
return json.dumps([doc.json() for doc in self])

@classmethod
def _check_for_valid_document_type(cls) -> None:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer returning a bool and raising at the call site. That way it is clearer where the error actually occurs, and this method can be re-used in contexts where only the information is needed, without wanting to raise an exception

if cls.document_type == AnyDocument:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a plan to make this check more sophisticated as part of this PR? otherwise this doesn't need to be a method imo

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, just tried to extract all the duplicate code but maybe I went a bit overboard hehe, moved it back to from_csv and from_pandas again

raise TypeError(
'There is no document schema defined. '
'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
)

@classmethod
def _check_for_valid_access_paths(cls, field_names: Optional[List[str]]) -> None:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

if field_names is None or len(field_names) == 0:
raise TypeError("No field names are given.")

valid = [
is_access_path_valid(cls.document_type, field) for field in field_names
]
if not all(valid):
raise ValueError(
f'Column names do not match the schema of the DocumentArray\'s document type '
f'({cls.document_type.__name__}): {list(compress(field_names, [not v for v in valid]))}'
)

@staticmethod
def access_path_dict_to_nested_dict(
Comment thread
anna-charlotte marked this conversation as resolved.
Outdated

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

didn't you already implement something like this in your previous PR? or am i mixing things up?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, the function _access_path_to_dict that is used within _access_path_dict_to_nested_dict that I implemented in the last PR.

  • _access_path_to_dict: transforms one access path to nested dict
  • _access_path_dict_to_nested_dict: transforms dict with (multiple) access path keys to a joint nested dict by calling the former func

I will move the latter to that other helper file, to keep those functions together yes?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok sounds good, yes keeping them in the same helper file would be ideal

access_path2val: Dict[str, Any]
) -> Dict[Any, Any]:
"""
Convert a dict, where the keys are access paths ("__"-separated) to a nested dictionary.

EXAMPLE USAGE

.. code-block:: python

access_path2val = {'image__url': 'some.png'}
assert access_path_dict_to_nested_dict(access_path2val) == {
'image': {'url': 'some.png'}
}

:param access_path2val: dict with access_paths as keys
:return: nested dict where the access path keys are split into separate field names and nested keys
"""
nested_dict: Dict[Any, Any] = {}
for access_path, value in access_path2val.items():
field2val = _access_path_to_dict(
access_path=access_path,
value=value if value not in ['', 'None'] else None,
)
_update_nested_dicts(to_update=nested_dict, update_with=field2val)
return nested_dict

@classmethod
def from_csv(
cls,
Expand Down Expand Up @@ -330,37 +382,23 @@ def from_csv(
"""
from docarray import DocumentArray

doc_type = cls.document_type
if doc_type == AnyDocument:
raise TypeError(
'There is no document schema defined. '
'To load from csv, please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
)
cls._check_for_valid_document_type()

doc_type = cls.document_type
da = DocumentArray.__class_getitem__(doc_type)()

with open(file_path, 'r', encoding=encoding) as fp:
rows = csv.DictReader(fp, dialect=dialect)
field_names: Optional[Sequence[Any]] = rows.fieldnames
field_names: List[str] = (
[] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
)

if field_names is None:
raise TypeError("No field names are given.")

valid = [is_access_path_valid(doc_type, field) for field in field_names]
if not all(valid):
raise ValueError(
f'Fields provided in the csv file do not match the schema of the DocumentArray\'s '
f'document type ({doc_type.__name__}): {list(compress(field_names, [not v for v in valid]))}'
)
cls._check_for_valid_access_paths(field_names=field_names)

for access_path2val in rows:
doc_dict: Dict[Any, Any] = {}
for access_path, value in access_path2val.items():
field2val = _access_path_to_dict(
access_path=access_path,
value=value if value not in ['', 'None'] else None,
)
_update_nested_dicts(to_update=doc_dict, update_with=field2val)

doc_dict: Dict[Any, Any] = IOMixinArray.access_path_dict_to_nested_dict(
access_path2val
)
da.append(doc_type.parse_obj(doc_dict))

return da
Expand Down Expand Up @@ -392,6 +430,59 @@ def to_csv(
doc_dict = _dict_to_access_paths(doc.dict())
writer.writerow(doc_dict)

@classmethod
def from_pandas(cls, df: pd.DataFrame) -> 'DocumentArray':
    """
    Load a DocumentArray from a `pandas.DataFrame` following the schema
    defined in the :attr:`~docarray.DocumentArray.document_type` attribute.
    Every row of the dataframe will be mapped to one document in the array.
    The column names of the dataframe have to match the field names of the
    Document type.
    For nested fields use "__"-separated access paths as column names,
    such as 'image__url'.

    List-like fields (including field of type DocumentArray) are not supported.

    :param df: pandas.DataFrame to extract Document's information from
    :return: DocumentArray where each Document corresponds to one row of `df`
    :raises TypeError: if no document schema is defined for the DocumentArray
        or the dataframe has no columns
    :raises ValueError: if a column name is not a valid access path of the
        Document type
    """
    from docarray import DocumentArray

    cls._check_for_valid_document_type()

    doc_type = cls.document_type
    da = DocumentArray.__class_getitem__(doc_type)()
    field_names = df.columns.tolist()

    cls._check_for_valid_access_paths(field_names=field_names)

    # Iterate plain tuples and pair them with the original column names.
    # Named tuples (`itertuples()` + `_asdict()`) rename fields that are not
    # valid identifiers, and popping 'Index' would silently drop a real
    # column that happens to carry that name.
    for row in df.itertuples(index=False, name=None):
        access_path2val = dict(zip(field_names, row))
        doc_dict = IOMixinArray.access_path_dict_to_nested_dict(access_path2val)
        da.append(doc_type.parse_obj(doc_dict))

    return da

def to_pandas(self) -> pd.DataFrame:
    """
    Save a DocumentArray to a `pandas.DataFrame`.
    The field names will be stored as column names. Each row of the dataframe corresponds
    to the information of one Document.
    Columns for nested fields will be named after the "__"-separated access paths,
    such as `"image__url"` for `image.url`.

    :return: pandas.DataFrame with one row per Document
    """
    fields = self.document_type._get_access_paths()

    # One flat {access_path: value} dict per document.
    rows = [_dict_to_access_paths(doc.dict()) for doc in self]

    # Schema columns come first; any additional key produced by a document is
    # appended after them (the same ordering row-wise appending produced).
    columns = dict.fromkeys(fields)
    for row in rows:
        columns.update(dict.fromkeys(row))

    # Build the frame in a single call: `DataFrame.append` was deprecated in
    # pandas 1.4 and removed in pandas 2.0, and appending row by row copies
    # the whole frame on every iteration. `dtype=object` stores the values as
    # given instead of letting pandas infer numeric dtypes (which would turn
    # e.g. `None` in an integer column into NaN).
    return pd.DataFrame(rows, columns=list(columns), dtype=object)

# Methods to load from/to files in different formats
@property
def _stream_header(self) -> bytes:
Expand Down
42 changes: 38 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ fastapi = {version = ">=0.87.0", optional = true }
rich = ">=13.1.0"
lz4 = {version= ">=1.0.0", optional = true}
pydub = {version = "^0.25.1", optional = true }
pandas = ">=1.1.0"

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it should be an optional dependency

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add it to common or as a separate extra pandas?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

separate. we will reorganize at some point anyway


[tool.poetry.extras]
common = ["protobuf", "lz4"]
Expand Down Expand Up @@ -60,6 +61,10 @@ check_untyped_defs = true
module = "av"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "pandas"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "trimesh"
ignore_missing_imports = true
Expand Down
4 changes: 1 addition & 3 deletions tests/units/array/test_array_from_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,7 @@ def test_from_csv_without_schema_raise_exception():


def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
with pytest.raises(
ValueError, match='Fields provided in the csv file do not match the schema'
):
with pytest.raises(ValueError, match='Column names do not match the schema'):
DocumentArray[nested_doc.__class__].from_csv(
file_path=str(TOYDATA_DIR / 'docs.csv')
)
85 changes: 85 additions & 0 deletions tests/units/array/test_array_from_to_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from typing import Optional

import pandas as pd
import pytest

from docarray import BaseDocument, DocumentArray
from docarray.documents import Image


@pytest.fixture()
def nested_doc_cls():
    """Return a document class with flat fields plus one nested ``Image`` field."""

    # Flat part of the schema: one optional and one required field.
    class MyDoc(BaseDocument):
        count: Optional[int]
        text: str

    # Extends the flat schema with a nested document, so that "__"-separated
    # access paths (e.g. ``image__url``) are exercised.
    class MyDocNested(MyDoc):
        image: Image

    return MyDocNested


def test_to_from_pandas_df(nested_doc_cls):
    """Round-trip a DocumentArray through ``to_pandas`` and ``from_pandas``."""
    da = DocumentArray[nested_doc_cls](
        [
            nested_doc_cls(
                count=0,
                text='hello',
                image=Image(url='aux.png'),
            ),
            nested_doc_cls(text='hello world', image=Image()),
        ]
    )
    df = da.to_pandas()
    assert isinstance(df, pd.DataFrame)
    assert len(df) == 2
    # Compare plain lists: an element-wise `Index == list` comparison raises
    # ValueError on a length mismatch instead of failing the assertion.
    assert df.columns.tolist() == [
        'id',
        'count',
        'text',
        'image__id',
        'image__url',
        'image__tensor',
        'image__embedding',
        'image__bytes',
    ]

    da_from_df = DocumentArray[nested_doc_cls].from_pandas(df)
    # zip() stops at the shorter input, so without this check the loop below
    # would pass vacuously if documents were lost in the round trip.
    assert len(da_from_df) == len(da)
    for doc1, doc2 in zip(da, da_from_df):
        assert doc1 == doc2


@pytest.fixture()
def nested_doc():
    """Return a document instance nested three levels deep (Outer > Middle > Inner)."""

    class Inner(BaseDocument):
        img: Optional[Image]

    class Middle(BaseDocument):
        img: Optional[Image]
        inner: Optional[Inner]

    class Outer(BaseDocument):
        img: Optional[Image]
        middle: Optional[Middle]

    # Assemble from the innermost level outwards.
    innermost = Inner(img=Image())
    middle = Middle(img=Image(), inner=innermost)
    return Outer(img=Image(), middle=middle)


def test_from_pandas_without_schema_raise_exception():
    """``from_pandas`` on an untyped DocumentArray must raise a TypeError."""
    # Build the input outside the `raises` block so that only the call under
    # test can satisfy the expected exception.
    df = pd.DataFrame(
        columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]]
    )
    with pytest.raises(TypeError, match='no document schema defined'):
        DocumentArray.from_pandas(df=df)


def test_from_pandas_with_wrong_schema_raise_exception(nested_doc):
    """Columns that do not match the document schema must raise a ValueError."""
    # Build the input outside the `raises` block so that only the call under
    # test can satisfy the expected exception.
    df = pd.DataFrame(
        columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]]
    )
    with pytest.raises(ValueError, match='Column names do not match the schema'):
        DocumentArray[nested_doc.__class__].from_pandas(df=df)