基于Qwen2-VL模型针对 ImageToText 任务进行微调训练 - 数据处理
flyfish
给定的图像生成一段自然语言描述。它的目标是生成一个或多个句子,能够准确地描述图像中的主要内容、物体、动作、场景等信息。例如,对于一张包含一只狗在草地上奔跑的图像,ImageToText 可能会生成 “一只狗在绿色的草地上快乐地奔跑” 这样的文字描述。
数据集描述
Image-caption task的数据集,包含train和valid
数据集简介
mscoco 2014 的 image caption 数据集,支持 image caption 任务。
以下说明该数据集的格式和结构。
数据格式
包含image_id, caption, image等信息。
数据集加载方式
from modelscope.msdatasets import MsDataset
ds = MsDataset.load("coco_2014_caption", namespace="modelscope", split="train")
print(ds[0])
生成的数据集格式
[{"id": "identity_12801","conversations": [{"from": "user","value": "/home/sss/datasets/1/coco_2014_caption/467265.jpg"},{"from": "assistant","value": "A group of young people standing in the middle of a street."}]},{"id": "identity_12802","conversations": [{"from": "user","value": "/home/sss/datasets/1/coco_2014_caption/227117.jpg"},{"from": "assistant","value": "Oven light on in a kitchen with wooden countertops. "}]},......
完整代码如下
import os
import pandas as pd
import json
import argparse
class CocoCaptionProcessor:
    """Processor for the ModelScope COCO 2014 image-caption dataset.

    Downloads up to ``max_data_number`` image/caption pairs, saves the images
    under ``dataset_dir``, records ``(image_path, caption)`` rows in a CSV
    file, and converts that CSV into conversation-format JSON files split
    into train/validation sets.
    """

    def __init__(self, max_data_number=500, dataset_dir='coco_2014_caption',
                 csv_file='coco2014.csv'):
        """Initialize the processor.

        :param max_data_number: maximum number of samples to download
        :param dataset_dir: directory where downloaded images are saved
        :param csv_file: path of the CSV file mapping image paths to captions
        """
        self.max_data_number = max_data_number
        self.dataset_dir = dataset_dir
        self.csv_file = csv_file
        self.image_paths = []
        self.captions = []

    def download_and_process(self):
        """Download the COCO 2014 caption dataset from ModelScope and
        write images plus a ``(image_path, caption)`` CSV to disk.

        Skips all work when ``dataset_dir`` already exists (assumes the CSV
        was produced by a previous run).
        """
        if os.path.exists(self.dataset_dir):
            print(f'{self.dataset_dir} directory already exists, skipping data processing.')
            return
        # Imported lazily so the rest of the module (e.g. JSON generation)
        # is usable without the heavy modelscope dependency installed.
        from modelscope.msdatasets import MsDataset
        ds = MsDataset.load('modelscope/coco_2014_caption',
                            subset_name='coco_2014_caption', split='train')
        total = min(self.max_data_number, len(ds))
        os.makedirs(self.dataset_dir, exist_ok=True)
        for i in range(total):
            item = ds[i]
            image_path = os.path.abspath(
                os.path.join(self.dataset_dir, f"{item['image_id']}.jpg"))
            item['image'].save(image_path)
            self.image_paths.append(image_path)
            self.captions.append(item['caption'])
            # Lightweight progress report every 50 images.
            if (i + 1) % 50 == 0:
                print(f'Processing {i+1}/{total} images ({(i+1)/total*100:.1f}%)')
        pd.DataFrame({
            'image_path': self.image_paths,
            'caption': self.captions,
        }).to_csv(self.csv_file, index=False)
        print(f'Data processing completed, processed a total of {total} images.')

    def generate_conversations_json(self, output_file='coco2014.json', train_ratio=0.8):
        """Convert the CSV into conversation-format JSON and split it into
        ``*_train.json`` / ``*_val.json`` files.

        :param output_file: base file name; ``_train`` / ``_val`` suffixes
            replace the ``.json`` extension
        :param train_ratio: fraction of samples assigned to the training set
        """
        df = pd.read_csv(self.csv_file)
        conversations = [
            {
                "id": f"identity_{i + 1}",
                "conversations": [
                    {"from": "user", "value": row['image_path']},
                    {"from": "assistant", "value": row['caption']},
                ],
            }
            for i, row in enumerate(df.to_dict('records'))
        ]
        # Split into train / validation sets at the requested ratio.
        split_index = int(len(conversations) * train_ratio)
        splits = {
            '_train.json': conversations[:split_index],
            '_val.json': conversations[split_index:],
        }
        for suffix, data in splits.items():
            with open(output_file.replace('.json', suffix), 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        print('Generated JSON files for training and validation sets.')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process COCO 2014 Caption Dataset.",
        epilog="Example usage: python script.py --max_data_number 100 --dataset_dir ./data --csv_file ./output.csv")
    # Group arguments so --help output stays readable.
    data_options = parser.add_argument_group('Data Options')
    data_options.add_argument('--max_data_number', type=int, default=16000,
                              help='Maximum number of data entries to process (default: %(default)s)')
    data_options.add_argument('--dataset_dir', type=str, default='coco_2014_caption',
                              help='Directory to save the dataset (default: %(default)s)')
    data_options.add_argument('--csv_file', type=str, default='./coco2014.csv',
                              help='Path to save the CSV file (default: %(default)s)')
    output_options = parser.add_argument_group('Output Options')
    output_options.add_argument('--output_file', type=str, default='coco2014.json',
                                help='Base name for output JSON files (default: %(default)s)')
    output_options.add_argument('--train_ratio', type=float, default=0.8,
                                help='Ratio of data to use for training set (default: %(default)s)')
    args = parser.parse_args()

    processor = CocoCaptionProcessor(max_data_number=args.max_data_number,
                                     dataset_dir=args.dataset_dir,
                                     csv_file=args.csv_file)
    processor.download_and_process()
    processor.generate_conversations_json(output_file=args.output_file,
                                          train_ratio=args.train_ratio)