diff --git a/paddlemix/datacopilot/core/__init__.py b/paddlemix/datacopilot/core/__init__.py index aa4a4cdac..fe74282c5 100644 --- a/paddlemix/datacopilot/core/__init__.py +++ b/paddlemix/datacopilot/core/__init__.py @@ -13,15 +13,16 @@ # limitations under the License. -from .schema import ( - T, - SCHEMA, - is_valid_schema, -) +from .schema import T, SCHEMA, is_valid_schema +from .dataset import MMDataset, ParallelMode +from .register import register -from .dataset import ( - MMDataset, - ParallelMode -) -from .register import register +# NOTE(review): per-modality token patterns; 'audio' and 'video' are empty here -- confirm intended values. +MODILATY_TOKENS = { + 'image': r'\d+', + 'audio': '', + 'video': '', +} +# Correctly spelled alias; the misspelled name is kept for backward compatibility. +MODALITY_TOKENS = MODILATY_TOKENS diff --git a/paddlemix/datacopilot/core/dataset.py b/paddlemix/datacopilot/core/dataset.py index 88122dac7..fabd1101b 100644 --- a/paddlemix/datacopilot/core/dataset.py +++ b/paddlemix/datacopilot/core/dataset.py @@ -122,6 +122,12 @@ def shuffle(self, seed: Optional[int]=None) -> 'MMDataset': random.shuffle(self._items) return self + def sample(self, k: int) -> 'MMDataset': + """Return a new MMDataset of k items drawn at random without replacement.""" + indices = random.sample(range(len(self)), k) + items = [self.items[i] for i in indices] + return MMDataset(items) + @classmethod def from_json(cls, path: str, schema: SCHEMA=SCHEMA.MM) -> 'MMDataset': with open(path, 'r') as f: diff --git a/paddlemix/datacopilot/core/schema.py b/paddlemix/datacopilot/core/schema.py index 127847733..3d0cfef1f 100644 --- a/paddlemix/datacopilot/core/schema.py +++ b/paddlemix/datacopilot/core/schema.py @@ -74,6 +74,75 @@ class SCHEMA(Enum): required: ['id', 'image', 'conversations'] """ + # MIX: 'images' is null or a non-empty list of {id, url, ...}; conversation roles are 'user'/'assistant'. + MIX = \ +r""" +$id: 'https://example.com/schemas/multimodal_mix' +$schema: 'https://json-schema.org/draft/2020-12/schema' + +type: object +properties: + id: + anyOf: + - + type: string + pattern: '\S{1,}' + - + type: integer + minimum: 0 + + images: + anyOf: + - + type: 'null' + - + type: array + minItems: 1 + items: + type: object + properties: + id: + type: integer + minimum: 0 + url: + type: string + pattern: '\.(jpg|jpeg|png|webp|JPG|JPEG|PNG)$' + description: '.png or 
.jpg or .jpeg or .webp' + height: + type: integer + minimum: 0 + width: + type: integer + minimum: 0 + required: + - id + - url + + conversations: + type: array + minItems: 1 + items: + type: object + properties: + from: + type: string + description: 'user or assistant' + enum: + - user + - assistant + value: + anyOf: + - type: string + - type: 'null' + required: + - from + - value +required: + - id + - images + - conversations +""" + SCHEMA_VALIDATORS = { k: JsonSchemaValidator.from_string(k.value) for k in SCHEMA } diff --git a/paddlemix/datacopilot/ops/__init__.py b/paddlemix/datacopilot/ops/__init__.py index 2d59a8218..546d3c36c 100644 --- a/paddlemix/datacopilot/ops/__init__.py +++ b/paddlemix/datacopilot/ops/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ._info import info, head -from ._h5 import from_h5, check_h5, export_h5 + +from .analysis import * +from .convert import * diff --git a/paddlemix/datacopilot/ops/analysis/__init__.py b/paddlemix/datacopilot/ops/analysis/__init__.py new file mode 100644 index 000000000..5cd684373 --- /dev/null +++ b/paddlemix/datacopilot/ops/analysis/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ._info import info, head + diff --git a/paddlemix/datacopilot/ops/_info.py b/paddlemix/datacopilot/ops/analysis/_info.py similarity index 97% rename from paddlemix/datacopilot/ops/_info.py rename to paddlemix/datacopilot/ops/analysis/_info.py index 8d3ff8515..f41a526e3 100644 --- a/paddlemix/datacopilot/ops/_info.py +++ b/paddlemix/datacopilot/ops/analysis/_info.py @@ -17,7 +17,7 @@ from rich.table import Table from rich.columns import Columns -from ..core import register, MMDataset +from ...core import register, MMDataset @register(force=True) diff --git a/paddlemix/datacopilot/ops/convert/__init__.py b/paddlemix/datacopilot/ops/convert/__init__.py new file mode 100644 index 000000000..f0cd76ed3 --- /dev/null +++ b/paddlemix/datacopilot/ops/convert/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ._h5 import from_h5, check_h5, export_h5 +from ._schema import convert_schema \ No newline at end of file diff --git a/paddlemix/datacopilot/ops/_h5.py b/paddlemix/datacopilot/ops/convert/_h5.py similarity index 98% rename from paddlemix/datacopilot/ops/_h5.py rename to paddlemix/datacopilot/ops/convert/_h5.py index a5fb743ca..f70fe746e 100644 --- a/paddlemix/datacopilot/ops/_h5.py +++ b/paddlemix/datacopilot/ops/convert/_h5.py @@ -24,8 +24,8 @@ from typing import List, Union -from ..core import MMDataset, SCHEMA -from ..misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk +from ...core import MMDataset, SCHEMA +from ...misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk __all__ = ['export_h5', 'check_h5', 'from_h5'] diff --git a/paddlemix/datacopilot/ops/convert/_schema.py b/paddlemix/datacopilot/ops/convert/_schema.py new file mode 100644 index 000000000..5edd6f823 --- /dev/null +++ b/paddlemix/datacopilot/ops/convert/_schema.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+from ...core import T
+from ...core import SCHEMA, is_valid_schema
+
+
+def convert_schema(
+    item: T,
+    in_schema: SCHEMA=SCHEMA.MM,
+    out_schema: SCHEMA=SCHEMA.MIX
+)-> T:
+    """Convert one dataset item from ``in_schema`` to ``out_schema``.
+
+    Args:
+        item: record laid out according to ``in_schema``.
+        in_schema: schema of ``item``; defaults to ``SCHEMA.MM``.
+        out_schema: schema of the result; defaults to ``SCHEMA.MIX``.
+
+    Returns:
+        ``item`` itself (not a copy) when both schemas are equal, otherwise
+        a newly built record.
+
+    Raises:
+        NotImplementedError: for distinct schemas other than MM -> MIX.
+    """
+    if in_schema == out_schema:
+        return item
+    if in_schema == SCHEMA.MM and out_schema == SCHEMA.MIX:
+        return _convert_mm_mix(item)
+    # Name the unsupported pair so the failure is actionable.
+    raise NotImplementedError(f'unsupported conversion: {in_schema} -> {out_schema}')
+
+
+def _convert_mm_mix(item):
+    """Build a MIX record (id / images / conversations) from an MM record."""
+    has_image = 'image' in item
+    # The single MM image becomes a one-element list whose entry has id 0.
+    images = [{'id': 0, 'url': item['image']}] if has_image else None
+
+    conversations = []
+    for conv in item['conversations']:
+        value = conv['value']
+        # Only 'human' turns become 'user'; every other speaker is 'assistant'.
+        role = 'user' if conv['from'] == 'human' else 'assistant'
+        if role == 'user' and has_image:
+            # NOTE(review): the placeholder literals were empty strings, so the
+            # test was always true and str.replace put '0' between all characters.
+            # '<image>' in / '<image>0</image>' out (0 = image id above) is
+            # assumed -- confirm against MODILATY_TOKENS.
+            if '<image>' in value:
+                value = value.replace('<image>', '<image>0</image>')
+            else:
+                value = '<image>0</image>\n' + value
+        conversations.append({'from': role, 'value': value})
+
+    return {
+        'id': item['id'],
+        'images': images,
+        'conversations': conversations,
+    }