AI开发平台MODELARTS-创建数据集:示例代码

时间:2025-02-25 20:55:09

示例代码

  • 示例一:根据数据类型创建图像数据集
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    session = Session()

    dataset_name = "dataset-image"  # name of the dataset to create
    data_type = "IMAGE"             # dataset data type: image dataset

    # Where the dataset's input data comes from.
    data_sources = {
        "type": 0,  # data source type, 0 means OBS
        "path": "/obs-gaia-test/data/image/image-classification/",  # OBS path of the data
    }

    # Output location of the dataset, used to store generated files such as
    # annotation information.
    work_path = {
        "type": 0,  # work directory type, 0 means OBS
        "path": "/obs-gaia-test/data/output/work_path/",  # OBS path of the work directory
    }

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        data_type=data_type,
        data_sources=data_sources,
        work_path=work_path,
    )
  • 示例二:根据数据类型创建图像数据集(导入标注信息)
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    session = Session()

    dataset_name = "dataset-image-with-annotations"
    data_type = "IMAGE"

    # Annotation format of the source data, so existing labels are imported
    # together with the images.
    annotation_config = {
        "scene": "image_classification",  # annotation scene: image classification
        "format_name": "ModelArts image classification 1.0",  # annotation format name
    }

    data_sources = {
        "type": 0,
        "path": "/obs-gaia-test/data/image/image-classification/",
        "annotation_config": annotation_config,
    }

    work_path = {
        "type": 0,
        "path": "/obs-gaia-test/data/output/work_path/",
    }

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        data_type=data_type,
        data_sources=data_sources,
        work_path=work_path,
    )
  • 示例三:根据数据类型创建表格数据集
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    session = Session()

    dataset_name = "dataset-table"
    data_type = "TABLE"

    data_sources = {
        "type": 0,
        "path": "/obs-gaia-test/data/table/table0/",
        "with_column_header": True,
    }

    work_path = {
        "type": 0,
        "path": "/obs-gaia-test/data/output/work_path/",
    }

    # A table dataset must be created with the schema of the table data:
    # one entry per column, listed in column order.
    schemas = [
        {"schema_id": 0, "name": "name", "type": "STRING"},
        {"schema_id": 1, "name": "age", "type": "STRING"},
        {"schema_id": 2, "name": "label", "type": "STRING"},
    ]

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        data_type=data_type,
        data_sources=data_sources,
        work_path=work_path,
        schema=schemas,
    )
  • 示例四:根据标注类型创建图像分类数据集
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    session = Session()

    dataset_name = "dataset-image-classification"
    dataset_type = 0   # annotation type of the dataset, 0 means image classification

    # The data source type is the integer 0 (OBS), matching work_path below;
    # it must not be passed as the string "0".
    data_sources = {
        "path": "/obs-gaia-test/data/image/image-classification/",
        "type": 0,
    }

    work_path = {
        "type": 0,
        "path": "/obs-gaia-test/data/output/work_path/",
    }

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        dataset_type=dataset_type,
        data_sources=data_sources,
        work_path=work_path,
    )
  • 示例五:根据标注类型创建文本三元组数据集
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    # The session is required by Dataset.create_dataset below; without it the
    # snippet fails with NameError when run on its own.
    session = Session()

    dataset_name = "dataset-text-triplet"
    dataset_type = 102   # annotation type of the dataset, 102 means text triplet

    data_sources = {
        "type": 0,
        "path": "/obs-gaia-test/data/text/text-classification/",
    }

    work_path = {
        "type": 0,
        "path": "/obs-gaia-test/data/output/work_path/",
    }

    # Creating a text-triplet dataset requires the label definitions.
    label_entity1 = {
        "name": "疾病",  # label name
        "type": 101,     # label type, 101 means entity label
    }
    label_entity2 = {
        "name": "疾病别称",
        "type": 101,
    }

    # A relation label must name its start and end entity labels in its
    # property. (Named relation_property to avoid shadowing the builtin
    # `property`.)
    relation_property = {
        "@modelarts:from_type": "疾病",     # start entity label
        "@modelarts:to_type": "疾病别称",   # end entity label
    }
    label_relation1 = {
        "name": "又称为",
        "type": 102,     # label type, 102 means relation label
        "property": relation_property,
    }

    labels = [label_entity1, label_entity2, label_relation1]

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        dataset_type=dataset_type,
        data_sources=data_sources,
        work_path=work_path,
        labels=labels,
    )
  • 示例六:根据标注类型创建表格数据集
    from modelarts.session import Session
    from modelarts.dataset import Dataset

    # The session is required by Dataset.create_dataset below; without it the
    # snippet fails with NameError when run on its own.
    session = Session()

    dataset_name = "dataset-table"
    dataset_type = 400    # annotation type of the dataset, 400 means table dataset

    data_sources = {
        "type": 0,
        "path": "/obs-gaia-test/data/table/table0/",
        "with_column_header": True,  # whether the table data contains a header row
    }

    work_path = {
        "type": 0,
        "path": "/obs-gaia-test/data/output/work_path/",
    }

    # A table dataset must be created with the table's column headers.
    # Each entry gives the column index (schema_id), the header name, and the
    # column data type ("STRING" means string).
    schemas = [
        {"schema_id": 0, "name": "name", "type": "STRING"},   # first column
        {"schema_id": 1, "name": "age", "type": "STRING"},
        {"schema_id": 2, "name": "label", "type": "STRING"},
    ]

    create_dataset_resp = Dataset.create_dataset(
        session,
        dataset_name=dataset_name,
        dataset_type=dataset_type,
        data_sources=data_sources,
        work_path=work_path,
        schema=schemas,
    )
support.huaweicloud.com/sdkreference-modelarts/modelarts_04_0328.html