diff --git a/paddlemix/datacopilot/core/__init__.py b/paddlemix/datacopilot/core/__init__.py
index aa4a4cdac..fe74282c5 100644
--- a/paddlemix/datacopilot/core/__init__.py
+++ b/paddlemix/datacopilot/core/__init__.py
@@ -13,15 +13,13 @@
# limitations under the License.
-from .schema import (
- T,
- SCHEMA,
- is_valid_schema,
-)
+from .schema import T, SCHEMA, is_valid_schema
+from .dataset import MMDataset, ParallelMode
+from .register import register
-from .dataset import (
- MMDataset,
- ParallelMode
-)
-from .register import register
+# Per-modality token patterns, keyed by modality name. The 'image' entry is
+# written as a raw string so that `\d` stays a regex escape rather than an
+# invalid Python string escape (the resulting value is unchanged).
+# NOTE(review): presumably these are regexes used to find modality
+# placeholders in conversation text, and the empty 'audio'/'video' entries
+# are not defined yet -- confirm against the callers.
+MODILATY_TOKENS = {
+    'image': r'\d+',
+    'audio': '',
+    'video': '',
+}
diff --git a/paddlemix/datacopilot/core/dataset.py b/paddlemix/datacopilot/core/dataset.py
index 88122dac7..fabd1101b 100644
--- a/paddlemix/datacopilot/core/dataset.py
+++ b/paddlemix/datacopilot/core/dataset.py
@@ -122,6 +122,11 @@ def shuffle(self, seed: Optional[int]=None) -> 'MMDataset':
random.shuffle(self._items)
return self
+    def sample(self, k: int) -> 'MMDataset':
+        """Return a new MMDataset made of ``k`` randomly chosen items.
+
+        Positions are drawn with ``random.sample``, i.e. without
+        replacement, so ``ValueError`` is raised when ``k`` is negative or
+        larger than ``len(self)``. The selected items are the same objects
+        held by this dataset, not copies.
+        """
+        picked = random.sample(range(len(self)), k)
+        return MMDataset([self.items[pos] for pos in picked])
+
@classmethod
def from_json(cls, path: str, schema: SCHEMA=SCHEMA.MM) -> 'MMDataset':
with open(path, 'r') as f:
diff --git a/paddlemix/datacopilot/core/schema.py b/paddlemix/datacopilot/core/schema.py
index 127847733..3d0cfef1f 100644
--- a/paddlemix/datacopilot/core/schema.py
+++ b/paddlemix/datacopilot/core/schema.py
@@ -74,6 +74,74 @@ class SCHEMA(Enum):
required: ['id', 'image', 'conversations']
"""
+    # JSON Schema (written as YAML) for "mix" samples: a sample `id`, an
+    # `images` entry that is either null or a non-empty list of objects with
+    # an integer `id` and an image `url`, and a non-empty `conversations` list
+    # of user/assistant turns. The literal is raw so the regex escapes in the
+    # `pattern` fields are not treated as Python string escapes.
+    MIX = \
+r"""
+$id: 'https://example.com/schemas/multimodal_mix'
+$schema: 'https://json-schema.org/draft/2020-12/schema'
+
+type: object
+properties:
+  id:
+    anyOf:
+      -
+        type: string
+        pattern: '\S{1,}'
+      -
+        type: integer
+        minimum: 0
+
+  images:
+    anyOf:
+      -
+        type: 'null'
+      -
+        type: array
+        minItems: 1
+        items:
+          type: object
+          properties:
+            id:
+              type: integer
+              minimum: 0
+            url:
+              type: string
+              pattern: '\.(jpg|jpeg|png|webp|JPG|JPEG|PNG)$'
+              description: '.png or .jpg or .jpeg or .webp'
+            height:
+              type: integer
+              minimum: 0
+            width:
+              type: integer
+              minimum: 0
+          required:
+            - id
+            - url
+
+  conversations:
+    type: array
+    minItems: 1
+    items:
+      type: object
+      properties:
+        from:
+          type: string
+          description: 'user or assistant'
+          enum:
+            - user
+            - assistant
+        value:
+          anyOf:
+            - type: string
+            - type: 'null'
+      required:
+        - from
+        - value
+required:
+  - id
+  - images
+  - conversations
+"""
+
SCHEMA_VALIDATORS = {
k: JsonSchemaValidator.from_string(k.value) for k in SCHEMA
}
diff --git a/paddlemix/datacopilot/ops/__init__.py b/paddlemix/datacopilot/ops/__init__.py
index 2d59a8218..546d3c36c 100644
--- a/paddlemix/datacopilot/ops/__init__.py
+++ b/paddlemix/datacopilot/ops/__init__.py
@@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from ._info import info, head
-from ._h5 import from_h5, check_h5, export_h5
+
+from .analysis import *
+from .convert import *
diff --git a/paddlemix/datacopilot/ops/analysis/__init__.py b/paddlemix/datacopilot/ops/analysis/__init__.py
new file mode 100644
index 000000000..5cd684373
--- /dev/null
+++ b/paddlemix/datacopilot/ops/analysis/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ._info import info, head
+
diff --git a/paddlemix/datacopilot/ops/_info.py b/paddlemix/datacopilot/ops/analysis/_info.py
similarity index 97%
rename from paddlemix/datacopilot/ops/_info.py
rename to paddlemix/datacopilot/ops/analysis/_info.py
index 8d3ff8515..f41a526e3 100644
--- a/paddlemix/datacopilot/ops/_info.py
+++ b/paddlemix/datacopilot/ops/analysis/_info.py
@@ -17,7 +17,7 @@
from rich.table import Table
from rich.columns import Columns
-from ..core import register, MMDataset
+from ...core import register, MMDataset
@register(force=True)
diff --git a/paddlemix/datacopilot/ops/convert/__init__.py b/paddlemix/datacopilot/ops/convert/__init__.py
new file mode 100644
index 000000000..f0cd76ed3
--- /dev/null
+++ b/paddlemix/datacopilot/ops/convert/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ._h5 import from_h5, check_h5, export_h5
+from ._schema import convert_schema
\ No newline at end of file
diff --git a/paddlemix/datacopilot/ops/_h5.py b/paddlemix/datacopilot/ops/convert/_h5.py
similarity index 98%
rename from paddlemix/datacopilot/ops/_h5.py
rename to paddlemix/datacopilot/ops/convert/_h5.py
index a5fb743ca..f70fe746e 100644
--- a/paddlemix/datacopilot/ops/_h5.py
+++ b/paddlemix/datacopilot/ops/convert/_h5.py
@@ -24,8 +24,8 @@
from typing import List, Union
-from ..core import MMDataset, SCHEMA
-from ..misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk
+from ...core import MMDataset, SCHEMA
+from ...misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk
__all__ = ['export_h5', 'check_h5', 'from_h5']
diff --git a/paddlemix/datacopilot/ops/convert/_schema.py b/paddlemix/datacopilot/ops/convert/_schema.py
new file mode 100644
index 000000000..5edd6f823
--- /dev/null
+++ b/paddlemix/datacopilot/ops/convert/_schema.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ...core import T
+from ...core import SCHEMA, is_valid_schema
+
+
+def convert_schema(
+    item: T,
+    in_schema: SCHEMA=SCHEMA.MM,
+    out_schema: SCHEMA=SCHEMA.MIX
+)-> T:
+    """Convert a single item from one schema to another.
+
+    Args:
+        item: the sample to convert.
+        in_schema: schema that ``item`` currently follows.
+        out_schema: schema the returned item should follow.
+
+    Returns:
+        ``item`` itself (not a copy) when both schemas are equal, otherwise
+        a newly built item laid out according to ``out_schema``.
+
+    Raises:
+        NotImplementedError: if no converter exists for the requested
+            ``in_schema`` / ``out_schema`` pair.
+    """
+    if in_schema == out_schema:
+        return item
+
+    # MM -> MIX is the only direction implemented so far.
+    if in_schema == SCHEMA.MM and out_schema == SCHEMA.MIX:
+        return _convert_mm_mix(item)
+
+    # Report member names only: an enum member's repr would embed the whole
+    # schema text it carries as its value.
+    raise NotImplementedError(
+        'schema conversion from {} to {} is not supported'.format(
+            getattr(in_schema, 'name', in_schema),
+            getattr(out_schema, 'name', out_schema),
+        )
+    )
+
+
+def _convert_mm_mix(item):
+ """Build a new item with ``id``, ``images`` and ``conversations`` keys from ``item``.
+
+ Reads ``item['id']``, ``item['conversations']`` and, when the key is
+ present, ``item['image']``. ``item`` itself is not modified; a new dict
+ is returned.
+ """
+ if 'image' in item:
+ # The single image becomes a one-element list whose entry has id 0.
+ images = [{
+ 'id': 0,
+ 'url': item['image'],
+ }]
+ else:
+ images = None
+
+ conversations = []
+ for conv in item['conversations']:
+ if conv['from'] == 'human':
+ role = 'user'
+ if 'image' in item:
+ # NOTE(review): `'' in <str>` is always True, so the `else`
+ # branch below can never run, and `str.replace('', '0')` inserts
+ # '0' before every character and at the end of the text.
+ # The empty literal looks like a lost image-placeholder token --
+ # confirm the intended token before relying on this branch.
+ if '' in conv['value']:
+ value = conv['value'].replace('', '0')
+ else:
+ value = '0\n' + conv['value']
+ else:
+ value = conv['value']
+ else:
+ # Every speaker other than 'human' is mapped to 'assistant' and its
+ # text is kept unchanged.
+ role = 'assistant'
+ value = conv['value']
+
+ conversations.append({
+ 'from': role,
+ 'value': value,
+ })
+
+ newitem = {
+ 'id': item['id'],
+ 'images': images,
+ 'conversations': conversations
+ }
+ return newitem
+