-
Notifications
You must be signed in to change notification settings - Fork 244
feat: multi modal document #188
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
89fb6a4
f16b19d
f084c0d
74f8fe1
cec8fe9
48cd192
c5002b4
d051f6c
13c4690
56f7415
18376d3
c714764
0899401
411fe77
c0421dd
1f0069f
3458c5b
f0a4735
2a654b9
c5211fe
06a1e98
ae18a9b
254f9a9
6756a03
38ae6ac
0f591b8
95a77cb
a23d858
a4c9103
e6f2be6
188548f
6ece923
dce5114
0a2d996
d4c950e
5127695
2e659a4
e88c968
755d9e4
6793fb3
3af7214
c546ad9
aca7d07
5534b8c
762c3f2
de142ff
8de0a3d
bd13124
32743df
fddb749
bf8064f
b697baf
9ec34f6
d8a351e
389e6ac
b998cc0
c060008
c2b6800
acf88c2
807367d
3cfcab3
8da0779
0c3267f
903ed42
447ce51
70e565d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,140 @@ | ||
| import base64 | ||
|
|
||
| import typing | ||
| from enum import Enum | ||
|
|
||
| from docarray.types.multimodal import Image, Text, Field, is_dataclass | ||
| from docarray.types.multimodal import TYPES_REGISTRY | ||
|
|
||
| if typing.TYPE_CHECKING: | ||
| from docarray import Document, DocumentArray | ||
|
|
||
|
|
||
class AttributeType(str, Enum):
    """Kinds of dataclass attribute recorded in a multi-modal schema.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    # A single field stored as one Document.
    DOCUMENT = 'document'
    # A single primitive value.
    PRIMITIVE = 'primitive'
    # An iterable of primitive values.
    ITERABLE_PRIMITIVE = 'iterable_primitive'
    # An iterable whose elements are each stored as a Document.
    ITERABLE_DOCUMENT = 'iterable_document'
    # A single nested dataclass.
    NESTED = 'nested'
    # An iterable of nested dataclasses.
    ITERABLE_NESTED = 'iterable_nested'
|
|
||
|
|
||
class MultiModalMixin:
    """Mixin converting a dataclass object into a Document tree and back.

    ``from_dataclass`` stores primitive fields in ``tags`` and every other
    field as a chunk of the root Document. For each field, its kind, declared
    type name and (for chunk-backed fields) chunk position are recorded under
    ``_metadata['multi_modal_schema']``, which ``get_multi_modal_attribute``
    uses to look the chunks up again.
    """

    @classmethod
    def from_dataclass(cls, obj):
        """Build a root Document from the fields of the dataclass ``obj``.

        :param obj: the dataclass object to convert.
        :return: a Document whose ``tags`` hold the primitive fields, whose
            ``chunks`` hold one entry per non-primitive field, and whose
            ``_metadata['multi_modal_schema']`` describes every field.
        :raises ValueError: if ``obj`` is not a dataclass or one of its fields
            carries an unsupported type annotation.
        """
        if not is_dataclass(obj):
            # Plain instances have no ``__name__``; fall back to the class
            # name so the intended ValueError is raised, not AttributeError.
            name = getattr(obj, '__name__', type(obj).__name__)
            raise ValueError(f'Object {name} is not a dataclass instance')

        from docarray import Document

        primitive_types = (str, int, float, bool)

        root = Document()
        tags = {}
        multi_modal_schema = {}
        for key, field in obj.__dataclass_fields__.items():
            attribute = getattr(obj, key)
            if field.type in primitive_types and not isinstance(field, Field):
                tags[key] = attribute
                multi_modal_schema[key] = {
                    'attribute_type': AttributeType.PRIMITIVE,
                    'type': field.type.__name__,
                }

            elif field.type == bytes and not isinstance(field, Field):
                # Raw bytes are kept in tags as base64-encoded text.
                tags[key] = base64.b64encode(attribute).decode()
                multi_modal_schema[key] = {
                    'attribute_type': AttributeType.PRIMITIVE,
                    'type': field.type.__name__,
                }
            elif isinstance(field.type, typing._GenericAlias):
                # NOTE(review): only aliases built by the typing module (e.g.
                # typing.List[X]) match here; PEP 585 built-in generics such
                # as list[X] are types.GenericAlias objects and fall through
                # to the last branch -- confirm whether they should be
                # supported.
                if field.type._name not in ['List', 'Iterable']:
                    raise ValueError(f'Unsupported type annotation {field.type._name}')

                sub_type = field.type.__args__[0]
                if sub_type in primitive_types:
                    tags[key] = attribute
                    multi_modal_schema[key] = {
                        'attribute_type': AttributeType.ITERABLE_PRIMITIVE,
                        # Keep the alias name exactly as the user wrote it.
                        'type': f'{field.type._name}[{sub_type.__name__}]',
                    }
                else:
                    # The kind depends only on the annotation, never on the
                    # elements, so resolve it up front. This mirrors the
                    # decision made in ``_from_obj`` and keeps the schema
                    # well defined when the iterable is empty.
                    if is_dataclass(sub_type):
                        iterable_type = AttributeType.ITERABLE_NESTED
                    elif isinstance(field, Field):
                        iterable_type = AttributeType.ITERABLE_DOCUMENT
                    else:
                        raise ValueError(
                            f'Unsupported type annotation inside Iterable: {sub_type}'
                        )

                    # All elements live under one intermediate chunk so the
                    # field occupies a single position in the root.
                    chunk = Document()
                    for element in attribute:
                        doc, _ = cls._from_obj(element, sub_type, field)
                        chunk.chunks.append(doc)
                    multi_modal_schema[key] = {
                        'attribute_type': iterable_type,
                        'type': f'{field.type._name}[{sub_type.__name__}]',
                        'position': len(root.chunks),
                    }
                    root.chunks.append(chunk)
            else:
                doc, attribute_type = cls._from_obj(attribute, field.type, field)
                multi_modal_schema[key] = {
                    'attribute_type': attribute_type,
                    'type': field.type.__name__,
                    'position': len(root.chunks),
                }
                root.chunks.append(doc)

        # TODO: may have to modify this?
        root.tags = tags
        root._metadata['multi_modal_schema'] = multi_modal_schema

        return root

    def get_multi_modal_attribute(self, attribute: str) -> 'DocumentArray':
        """Return the Documents stored for a multi-modal ``attribute``.

        :param attribute: name of a field recorded in the multi-modal schema.
        :return: a DocumentArray wrapping the single chunk of a Document or
            nested field, or the chunks of the intermediate chunk for an
            iterable field.
        :raises ValueError: if this Document has no multi-modal schema, the
            schema does not contain ``attribute``, or the attribute is not
            backed by chunks (e.g. it is a primitive kept in tags).
        """
        from docarray import DocumentArray

        if 'multi_modal_schema' not in self._metadata:
            raise ValueError(
                'the Document does not correspond to a Multi Modal Document'
            )

        schema = self._metadata['multi_modal_schema']
        if attribute not in schema:
            raise ValueError(
                f'the Document schema does not contain attribute {attribute}'
            )

        attribute_type = schema[attribute]['attribute_type']
        # Only chunk-backed attributes carry a position.
        position = schema[attribute].get('position')

        if attribute_type in (AttributeType.DOCUMENT, AttributeType.NESTED):
            return DocumentArray([self.chunks[position]])
        if attribute_type in (
            AttributeType.ITERABLE_DOCUMENT,
            AttributeType.ITERABLE_NESTED,
        ):
            return self.chunks[position].chunks
        raise ValueError(
            f'Invalid attribute {attribute}: must be a Document attribute or nested dataclass'
        )

    @classmethod
    def _from_obj(cls, obj, obj_type, field) -> typing.Tuple['Document', AttributeType]:
        """Convert one field value into a Document.

        :param obj: the value to convert.
        :param obj_type: the type annotated for the value.
        :param field: the dataclass field the value belongs to.
        :return: the resulting Document together with its attribute type:
            ``NESTED`` when ``obj_type`` is a dataclass, ``DOCUMENT`` when the
            value is written through the field's serializer.
        :raises ValueError: if ``obj_type`` is not a dataclass and ``field``
            is not a ``Field``.
        """
        from docarray import Document

        attribute_type = AttributeType.DOCUMENT

        if is_dataclass(obj_type):
            doc = cls.from_dataclass(obj)
            attribute_type = AttributeType.NESTED
        elif isinstance(field, Field):
            doc = Document()
            # The field's serializer writes the value into the new Document.
            field.serializer(obj, field.name, doc)
        else:
            raise ValueError('Unsupported type annotation')
        return doc, attribute_type
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess there must be some update of the docstrings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated