-
Notifications
You must be signed in to change notification settings - Fork 244
feat(v2): add from and to pandas df for documentarray #1161
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 26 commits
dce8c6b
37892a9
e1e90cf
7bd7f1f
05e19ba
5b0c760
5babdae
d6b29c5
9149e57
3d980b7
516ffbb
5fac964
a0d9711
c9005b1
9fe58f5
90867bb
1a395da
00a9ea7
e06e533
c8e4cf8
6e3a624
12df134
504c13c
4bf8976
ca2bf4f
81ea38d
5d02b35
687d98a
2c02bf4
006a186
c68a48c
fb74c4f
9d90053
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,14 +16,16 @@ | |
| Dict, | ||
| Generator, | ||
| Iterable, | ||
| List, | ||
| Optional, | ||
| Sequence, | ||
| Tuple, | ||
| Type, | ||
| TypeVar, | ||
| Union, | ||
| ) | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from docarray.base_document import AnyDocument, BaseDocument | ||
| from docarray.helper import ( | ||
| _access_path_to_dict, | ||
|
|
@@ -302,6 +304,56 @@ def to_json(self) -> str: | |
| """ | ||
| return json.dumps([doc.json() for doc in self]) | ||
|
|
||
| @classmethod | ||
| def _check_for_valid_document_type(cls) -> None: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer returning a bool and raising at the call site. That way it is clearer where the error actually occurs, and this method can be re-used in contexts where only the information is needed, without wanting to raise an exception |
||
| if cls.document_type == AnyDocument: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a plan to make this check more sophisticated as part of this PR? otherwise this doesn't need to be a method imo
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, just tried to extract all the duplicate code but maybe I went a bit overboard hehe, moved it back to |
||
| raise TypeError( | ||
| 'There is no document schema defined. ' | ||
| 'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' | ||
| ) | ||
|
|
||
| @classmethod | ||
| def _check_for_valid_access_paths(cls, field_names: Optional[List[str]]) -> None: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here |
||
| if field_names is None or len(field_names) == 0: | ||
| raise TypeError("No field names are given.") | ||
|
|
||
| valid = [ | ||
| is_access_path_valid(cls.document_type, field) for field in field_names | ||
| ] | ||
| if not all(valid): | ||
| raise ValueError( | ||
| f'Column names do not match the schema of the DocumentArray\'s document type ' | ||
| f'({cls.document_type.__name__}): {list(compress(field_names, [not v for v in valid]))}' | ||
| ) | ||
|
|
||
| @staticmethod | ||
| def access_path_dict_to_nested_dict( | ||
|
anna-charlotte marked this conversation as resolved.
Outdated
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. didn't you already implement something like this in your previous PR? or am i mixing things up?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, the function
I will move the latter to that other helper file, to keep those functions together yes?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok sounds good, yes keeping them in the same helper file would be ideal |
||
| access_path2val: Dict[str, Any] | ||
| ) -> Dict[Any, Any]: | ||
| """ | ||
| Convert a dict, where the keys are access paths ("__"-separated) to a nested dictionary. | ||
|
|
||
| EXAMPLE USAGE | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| access_path2val = {'image__url': 'some.png'} | ||
| assert access_path_dict_to_nested_dict(access_path2val) == { | ||
| 'image': {'url': 'some.png'} | ||
| } | ||
|
|
||
| :param access_path2val: dict with access_paths as keys | ||
| :return: nested dict where the access path keys are split into separate field names and nested keys | ||
| """ | ||
| nested_dict: Dict[Any, Any] = {} | ||
| for access_path, value in access_path2val.items(): | ||
| field2val = _access_path_to_dict( | ||
| access_path=access_path, | ||
| value=value if value not in ['', 'None'] else None, | ||
| ) | ||
| _update_nested_dicts(to_update=nested_dict, update_with=field2val) | ||
| return nested_dict | ||
|
|
||
| @classmethod | ||
| def from_csv( | ||
| cls, | ||
|
|
@@ -330,37 +382,23 @@ def from_csv( | |
| """ | ||
| from docarray import DocumentArray | ||
|
|
||
| doc_type = cls.document_type | ||
| if doc_type == AnyDocument: | ||
| raise TypeError( | ||
| 'There is no document schema defined. ' | ||
| 'To load from csv, please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.' | ||
| ) | ||
| cls._check_for_valid_document_type() | ||
|
|
||
| doc_type = cls.document_type | ||
| da = DocumentArray.__class_getitem__(doc_type)() | ||
|
|
||
| with open(file_path, 'r', encoding=encoding) as fp: | ||
| rows = csv.DictReader(fp, dialect=dialect) | ||
| field_names: Optional[Sequence[Any]] = rows.fieldnames | ||
| field_names: List[str] = ( | ||
| [] if rows.fieldnames is None else [str(f) for f in rows.fieldnames] | ||
| ) | ||
|
|
||
| if field_names is None: | ||
| raise TypeError("No field names are given.") | ||
|
|
||
| valid = [is_access_path_valid(doc_type, field) for field in field_names] | ||
| if not all(valid): | ||
| raise ValueError( | ||
| f'Fields provided in the csv file do not match the schema of the DocumentArray\'s ' | ||
| f'document type ({doc_type.__name__}): {list(compress(field_names, [not v for v in valid]))}' | ||
| ) | ||
| cls._check_for_valid_access_paths(field_names=field_names) | ||
|
|
||
| for access_path2val in rows: | ||
| doc_dict: Dict[Any, Any] = {} | ||
| for access_path, value in access_path2val.items(): | ||
| field2val = _access_path_to_dict( | ||
| access_path=access_path, | ||
| value=value if value not in ['', 'None'] else None, | ||
| ) | ||
| _update_nested_dicts(to_update=doc_dict, update_with=field2val) | ||
|
|
||
| doc_dict: Dict[Any, Any] = IOMixinArray.access_path_dict_to_nested_dict( | ||
| access_path2val | ||
| ) | ||
| da.append(doc_type.parse_obj(doc_dict)) | ||
|
|
||
| return da | ||
|
|
@@ -392,6 +430,59 @@ def to_csv( | |
| doc_dict = _dict_to_access_paths(doc.dict()) | ||
| writer.writerow(doc_dict) | ||
|
|
||
| @classmethod | ||
| def from_pandas(cls, df: pd.DataFrame) -> 'DocumentArray': | ||
| """ | ||
| Load a DocumentArray from a `pandas.DataFrame` following the schema | ||
| defined in the :attr:`~docarray.DocumentArray.document_type` attribute. | ||
| Every row of the dataframe will be mapped to one document in the array. | ||
|
anna-charlotte marked this conversation as resolved.
Outdated
|
||
| The column names of the dataframe have to match the field names of the | ||
|
anna-charlotte marked this conversation as resolved.
Outdated
|
||
| Document type. | ||
| For nested fields use "__"-separated access paths as column names, | ||
| such as 'image__url'. | ||
|
|
||
| List-like fields (including fields of type DocumentArray) are not supported. | ||
|
|
||
|
anna-charlotte marked this conversation as resolved.
|
||
| :param df: pandas.DataFrame to extract Document's information from | ||
| :return: DocumentArray | ||
|
anna-charlotte marked this conversation as resolved.
Outdated
|
||
| """ | ||
| from docarray import DocumentArray | ||
|
|
||
| cls._check_for_valid_document_type() | ||
|
|
||
| doc_type = cls.document_type | ||
| da = DocumentArray.__class_getitem__(doc_type)() | ||
| field_names = df.columns.tolist() | ||
|
|
||
| cls._check_for_valid_access_paths(field_names=field_names) | ||
|
|
||
| for row in df.itertuples(): | ||
| access_path2val = row._asdict() | ||
| access_path2val.pop('Index', None) | ||
| doc_dict = IOMixinArray.access_path_dict_to_nested_dict(access_path2val) | ||
| da.append(doc_type.parse_obj(doc_dict)) | ||
|
|
||
| return da | ||
|
|
||
| def to_pandas(self) -> pd.DataFrame: | ||
| """ | ||
| Save a DocumentArray to a `pandas.DataFrame`. | ||
| The field names will be stored as column names. Each row of the dataframe corresponds | ||
| to the information of one Document. | ||
| Columns for nested fields will be named after the "__"-separated access paths, | ||
| such as `"image__url"` for `image.url`. | ||
|
|
||
| :return: pandas.DataFrame | ||
| """ | ||
| fields = self.document_type._get_access_paths() | ||
| df = pd.DataFrame(columns=fields) | ||
|
|
||
| for doc in self: | ||
| doc_dict = _dict_to_access_paths(doc.dict()) | ||
| df = df.append(doc_dict, ignore_index=True) | ||
|
|
||
| return df | ||
|
|
||
| # Methods to load from/to files in different formats | ||
| @property | ||
| def _stream_header(self) -> bytes: | ||
|
|
||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ fastapi = {version = ">=0.87.0", optional = true } | |
| rich = ">=13.1.0" | ||
| lz4 = {version= ">=1.0.0", optional = true} | ||
| pydub = {version = "^0.25.1", optional = true } | ||
| pandas = ">=1.1.0" | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it should be an optional dependency
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add it to
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. separate. we will reorganize at some point anyway |
||
|
|
||
| [tool.poetry.extras] | ||
| common = ["protobuf", "lz4"] | ||
|
|
@@ -60,6 +61,10 @@ check_untyped_defs = true | |
| module = "av" | ||
| ignore_missing_imports = true | ||
|
|
||
| [[tool.mypy.overrides]] | ||
| module = "pandas" | ||
| ignore_missing_imports = true | ||
|
|
||
| [[tool.mypy.overrides]] | ||
| module = "trimesh" | ||
| ignore_missing_imports = true | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| from typing import Optional | ||
|
|
||
| import pandas as pd | ||
| import pytest | ||
|
|
||
| from docarray import BaseDocument, DocumentArray | ||
| from docarray.documents import Image | ||
|
|
||
|
|
||
| @pytest.fixture() | ||
| def nested_doc_cls(): | ||
| class MyDoc(BaseDocument): | ||
| count: Optional[int] | ||
| text: str | ||
|
|
||
| class MyDocNested(MyDoc): | ||
| image: Image | ||
|
|
||
| return MyDocNested | ||
|
|
||
|
|
||
| def test_to_from_pandas_df(nested_doc_cls): | ||
| da = DocumentArray[nested_doc_cls]( | ||
| [ | ||
| nested_doc_cls( | ||
| count=0, | ||
| text='hello', | ||
| image=Image(url='aux.png'), | ||
| ), | ||
| nested_doc_cls(text='hello world', image=Image()), | ||
| ] | ||
| ) | ||
| df = da.to_pandas() | ||
| assert isinstance(df, pd.DataFrame) | ||
| assert len(df) == 2 | ||
| assert ( | ||
| df.columns | ||
| == [ | ||
| 'id', | ||
| 'count', | ||
| 'text', | ||
| 'image__id', | ||
| 'image__url', | ||
| 'image__tensor', | ||
| 'image__embedding', | ||
| 'image__bytes', | ||
| ] | ||
| ).all() | ||
|
|
||
| da_from_df = DocumentArray[nested_doc_cls].from_pandas(df) | ||
| for doc1, doc2 in zip(da, da_from_df): | ||
| assert doc1 == doc2 | ||
|
|
||
|
|
||
| @pytest.fixture() | ||
| def nested_doc(): | ||
| class Inner(BaseDocument): | ||
| img: Optional[Image] | ||
|
|
||
| class Middle(BaseDocument): | ||
| img: Optional[Image] | ||
| inner: Optional[Inner] | ||
|
|
||
| class Outer(BaseDocument): | ||
| img: Optional[Image] | ||
| middle: Optional[Middle] | ||
|
|
||
| doc = Outer(img=Image(), middle=Middle(img=Image(), inner=Inner(img=Image()))) | ||
| return doc | ||
|
|
||
|
|
||
| def test_from_pandas_without_schema_raise_exception(): | ||
| with pytest.raises(TypeError, match='no document schema defined'): | ||
| df = pd.DataFrame( | ||
| columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] | ||
| ) | ||
| DocumentArray.from_pandas(df=df) | ||
|
|
||
|
|
||
| def test_from_pandas_with_wrong_schema_raise_exception(nested_doc): | ||
| with pytest.raises(ValueError, match='Column names do not match the schema'): | ||
| df = pd.DataFrame( | ||
| columns=['title', 'count'], data=[['title 0', 0], ['title 1', 1]] | ||
| ) | ||
| DocumentArray[nested_doc.__class__].from_pandas(df=df) |
Uh oh!
There was an error while loading. Please reload this page.