Skip to content

Commit

Permalink
[wip] add mix schema (PaddlePaddle#664)
Browse files Browse the repository at this point in the history
## PaddleMIX统一多模数据格式
1. [x] 纯文
2. [x] 单图
3. [x] 多图
4. [x] interleaved
5. [ ] 音频
6. [ ] 视频 

## 功能
1. [x]  `MIX`格式定义和检查
2. [x] `MM`格式到`MIX`格式转换Op

---

## 特殊字段
1. [x] `images <-> <image>id</image>`
2. [ ] `audios <-> <audio>id</audio>`
3. [ ] `videos <-> <video>id</video>`


```
[
    {
        'id': '000002b66c9c498e',
        'images': [
                {
                    'id': 0,
                    'url': 'train/000002b66c9c498e.jpg', 
                    'heigh': 100,
                    'width': 100,
                }, 
                {
                    'id': 1,
                    'url': 'train/000002b66c9c498e.jpg', 
                    'heigh': 100,
                    'width': 100,
                }, 
            ],
        'conversations': [
                {
                    'from': 'user', 
                    'value': '<image>id</image><image>id</image> xxxx'
                }, 
                {
                    'from': 'assistant', 
                    'value': 'xxx'
                },
                {
                    'from': 'user', 
                    'value': 'xxxx <image>id</image>'
                }, 
                {
                    'from': 'assistant', 
                    'value': 'xxx'
                }
            ],
    },
]
```
  • Loading branch information
lyuwenyu authored Sep 24, 2024
1 parent 84efc75 commit 164ad73
Show file tree
Hide file tree
Showing 9 changed files with 194 additions and 15 deletions.
18 changes: 8 additions & 10 deletions paddlemix/datacopilot/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,13 @@
# limitations under the License.


from .schema import (
T,
SCHEMA,
is_valid_schema,
)
from .schema import T, SCHEMA, is_valid_schema
from .dataset import MMDataset, ParallelMode
from .register import register

from .dataset import (
MMDataset,
ParallelMode
)

from .register import register
# Patterns for the per-modality reference tags that appear inside
# conversation text, e.g. ``<image>0</image>``. The ``\d+`` part stands for the
# numeric id written between the opening and closing tag.
# Raw strings are used so that ``\d`` is not parsed as a (invalid) string
# escape sequence.
# NOTE(review): the name is a misspelling of "MODALITY"; it is kept unchanged
# because it is the module-level name other code may import.
MODILATY_TOKENS = {
    'image': r'<image>\d+</image>',
    'audio': r'<audio>\d+</audio>',
    'video': r'<video>\d+</video>',
}
5 changes: 5 additions & 0 deletions paddlemix/datacopilot/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ def shuffle(self, seed: Optional[int]=None) -> 'MMDataset':
random.shuffle(self._items)
return self

def sample(self, k: int) -> 'MMDataset':
    """Build a new dataset from ``k`` randomly chosen items of this one.

    Positions are drawn with ``random.sample`` over ``range(len(self))``,
    so no position is picked twice and the module-level ``random`` state
    is used.

    Args:
        k: number of items to draw.

    Returns:
        A new ``MMDataset`` wrapping the selected items, in drawn order.

    Raises:
        ValueError: propagated from ``random.sample`` when ``k`` is
            negative or larger than ``len(self)``.
    """
    picked = random.sample(range(len(self)), k)
    return MMDataset([self.items[idx] for idx in picked])

@classmethod
def from_json(cls, path: str, schema: SCHEMA=SCHEMA.MM) -> 'MMDataset':
with open(path, 'r') as f:
Expand Down
68 changes: 68 additions & 0 deletions paddlemix/datacopilot/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,74 @@ class SCHEMA(Enum):
required: ['id', 'image', 'conversations']
"""

# JSON Schema (written as YAML) for the unified ``MIX`` item layout:
#   - ``id``: non-empty string or non-negative integer
#   - ``images``: null, or a non-empty list of ``{id, url[, heigh, width]}``
#   - ``conversations``: non-empty list of ``{from, value}`` turns where
#     ``from`` is ``user`` or ``assistant``
# The literal is a raw string so the regex backslashes (``\S``, ``\.``) are
# not parsed as Python string escapes.
# NOTE(review): the image height key is spelled ``heigh``; it is kept as-is
# because renaming it would change the data format.
MIX = \
r"""
$id: 'https://example.com/schemas/multimodal_mix'
$schema: 'https://json-schema.org/draft/2020-12/schema'
type: object
properties:
    id:
        anyOf:
            -
                type: string
                pattern: '\S{1,}'
            -
                type: integer
                minimum: 0
    images:
        anyOf:
            -
                type: 'null'
            -
                type: array
                minItems: 1
                items:
                    type: object
                    properties:
                        id:
                            type: integer
                            minimum: 0
                        url:
                            type: string
                            pattern: '\.(jpg|jpeg|png|webp|JPG|JPEG|PNG|WEBP)$'
                            description: '.png or .jpg or .jpeg or .webp'
                        heigh:
                            type: integer
                            minimum: 0
                        width:
                            type: integer
                            minimum: 0
                    required:
                        - id
                        - url
    conversations:
        type: array
        minItems: 1
        items:
            type: object
            properties:
                from:
                    type: string
                    description: 'user or assistant'
                    enum:
                        - user
                        - assistant
                value:
                    anyOf:
                        - type: string
                        - type: 'null'
            required:
                - from
                - value
required:
    - id
    - images
    - conversations
"""

# One validator per SCHEMA member, built once at import time from the
# member's schema text (its ``value``) and keyed by the member itself.
SCHEMA_VALIDATORS = {
    schema: JsonSchemaValidator.from_string(schema.value)
    for schema in SCHEMA
}
Expand Down
5 changes: 3 additions & 2 deletions paddlemix/datacopilot/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from ._info import info, head
from ._h5 import from_h5, check_h5, export_h5

from .analysis import *
from .convert import *
17 changes: 17 additions & 0 deletions paddlemix/datacopilot/ops/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ._info import info, head

Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from rich.table import Table
from rich.columns import Columns

from ..core import register, MMDataset
from ...core import register, MMDataset


@register(force=True)
Expand Down
17 changes: 17 additions & 0 deletions paddlemix/datacopilot/ops/convert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ._h5 import from_h5, check_h5, export_h5
from ._schema import convert_schema
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@

from typing import List, Union

from ..core import MMDataset, SCHEMA
from ..misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk
from ...core import MMDataset, SCHEMA
from ...misc import ParallelMode, parallel_map, freeze_rng_state, enumerate_chunk

__all__ = ['export_h5', 'check_h5', 'from_h5']

Expand Down
73 changes: 73 additions & 0 deletions paddlemix/datacopilot/ops/convert/_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from ...core import T
from ...core import SCHEMA, is_valid_schema


def convert_schema(
    item: T,
    in_schema: SCHEMA=SCHEMA.MM,
    out_schema: SCHEMA=SCHEMA.MIX
)-> T:
    """Convert a single item from ``in_schema`` to ``out_schema``.

    Args:
        item: the item to convert.
        in_schema: schema the item currently follows.
        out_schema: schema the returned item should follow.

    Returns:
        ``item`` itself (not a copy) when both schemas are equal, otherwise
        the converted item.

    Raises:
        NotImplementedError: if no conversion exists for the requested
            schema pair.
    """
    if in_schema == out_schema:
        return item

    # MM -> MIX is the only conversion implemented so far.
    if in_schema == SCHEMA.MM and out_schema == SCHEMA.MIX:
        return _convert_mm_mix(item)

    raise NotImplementedError(
        f'schema conversion from {in_schema} to {out_schema} is not supported'
    )


def _convert_mm_mix(item):
if 'image' in item:
images = [{
'id': 0,
'url': item['image'],
}]
else:
images = None

conversations = []
for conv in item['conversations']:
if conv['from'] == 'human':
role = 'user'
if 'image' in item:
if '<image>' in conv['value']:
value = conv['value'].replace('<image>', '<image>0</image>')
else:
value = '<image>0</image>\n' + conv['value']
else:
value = conv['value']
else:
role = 'assistant'
value = conv['value']

conversations.append({
'from': role,
'value': value,
})

newitem = {
'id': item['id'],
'images': images,
'conversations': conversations
}
return newitem

0 comments on commit 164ad73

Please sign in to comment.