AI开发平台ModelArts-自定义模型规范:“train.py”示例

时间:2025-02-12 14:55:05

“train.py”示例

表4 环境变量说明

变量名称

说明

示例

ENV_AG_MODEL_DIR

模型存放路径,AI Gallery的模型仓库地址,包含模型仓库的所有文件。

“/home/ma-user/.cache/gallery/model/ur12345--gpt2”

ENV_AG_DATASET_DIR

数据集存放路径,AI Gallery的数据集仓库地址,包含数据集仓库的所有文件。

“/home/ma-user/.cache/gallery/dataset/ur12345--data_demo”

ENV_AG_USER_PARAMS

配置的训练超参json字符串。创建训练任务时在算法配置页面设置的超参,用json字符串表示。

{"per_device_eval_batch_size":"32","lr":"0.001","logging_steps":"24"}

ENV_AG_TRAIN_OUTPUT_DIR

训练产物文件存放路径。训练产物将被保存到该路径。训练任务结束后,由AI Gallery平台将该目录上传到新模型的仓库中。

“/home/ma-user/.cache/gallery/output”

ENV_AG_USER_METRICS_LOG_PATH

训练数据的日志文件存放路径。训练过程中的迭代次数、LOSS和吞吐数据按照“迭代次数|loss|吞吐”格式记录在日志中,AI Gallery通过环境变量找到日志,从中获取实际数据绘制成“吞吐”“训练LOSS”曲线,呈现在训练的“指标效果”中。具体请参见查看训练效果。

说明:

日志文件中的迭代次数、LOSS和吞吐数据必须按照“迭代次数|loss|吞吐”格式存放,否则AI Gallery会解析数据失败,导致“吞吐”“训练LOSS”曲线异常。

“/var/logs/user_metrics.log”

"""AI Gallery custom-model "train.py" example.

Fine-tunes an image-classification model from the AI Gallery model
repository on an "imagefolder"-layout dataset. All paths and
hyper-parameters are supplied by the platform through the ENV_AG_*
environment variables documented in the AI Gallery model spec.
"""
import json
import os

import numpy as np
from datasets import load_dataset
from sklearn import metrics
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
)
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
)

# Names of the environment variables injected by the AI Gallery platform.
# Working directory
ENV_AG_WORK_DIR = 'ENV_AG_WORK_DIR'
# Model repository path
ENV_AG_MODEL_DIR = 'ENV_AG_MODEL_DIR'
# Dataset repository path
ENV_AG_DATASET_DIR = 'ENV_AG_DATASET_DIR'
# JSON string holding the user-configured training hyper-parameters
ENV_AG_USER_PARAMS = 'ENV_AG_USER_PARAMS'
# Directory where training artifacts must be written
ENV_AG_TRAIN_OUTPUT_DIR = 'ENV_AG_TRAIN_OUTPUT_DIR'

# Set once by _process_input_data() and read by _format_transforms().
# Module-level because datasets' with_transform() takes a bare callable.
_transforms = None


def _multi_class_classification_metrics(pred):
    """Compute multi-class classification metrics for Trainer.

    Args:
        pred: a (logits, labels) pair as passed by ``Trainer`` to its
            ``compute_metrics`` callback; logits have shape
            (batch, num_labels).

    Returns:
        dict mapping metric name to float: accuracy plus F1, precision
        and recall in macro/micro/weighted averaging.
    """
    raw_predictions, labels = pred
    # Hard class prediction = arg-max over the label axis.
    predictions = np.argmax(raw_predictions, axis=1)
    results = {
        "f1_macro": metrics.f1_score(labels, predictions, average="macro"),
        "f1_micro": metrics.f1_score(labels, predictions, average="micro"),
        "f1_weighted": metrics.f1_score(labels, predictions, average="weighted"),
        "precision_macro": metrics.precision_score(labels, predictions, average="macro"),
        "precision_micro": metrics.precision_score(labels, predictions, average="micro"),
        "precision_weighted": metrics.precision_score(labels, predictions, average="weighted"),
        "recall_macro": metrics.recall_score(labels, predictions, average="macro"),
        "recall_micro": metrics.recall_score(labels, predictions, average="micro"),
        "recall_weighted": metrics.recall_score(labels, predictions, average="weighted"),
        "accuracy": metrics.accuracy_score(labels, predictions),
    }
    return results


def parse_args():
    """Return the user-configured hyper-parameters as a dict.

    The AI Gallery platform passes them as a JSON string in the
    ENV_AG_USER_PARAMS environment variable.

    Raises:
        RuntimeError: if the environment variable is not set (clearer
            than the TypeError json.loads(None) would otherwise raise).
        json.JSONDecodeError: if the variable holds invalid JSON.
    """
    raw_params = os.getenv(ENV_AG_USER_PARAMS)
    if raw_params is None:
        raise RuntimeError(
            'Environment variable %s is not set; it must contain the '
            'training hyper-parameters as a JSON string.' % ENV_AG_USER_PARAMS)
    return json.loads(raw_params)


def _process_input_data(image_processor):
    """Load the dataset and attach on-the-fly augmentation transforms.

    Args:
        image_processor: the model's AutoImageProcessor; its mean/std
            and size configuration drive normalization and cropping.

    Returns:
        the dataset with ``_format_transforms`` applied lazily via
        ``with_transform``.
    """
    # Load an "imagefolder"-layout dataset from the platform-provided path.
    dataset_path = os.getenv(ENV_AG_DATASET_DIR)
    dataset = load_dataset("imagefolder", data_dir=dataset_path)
    # Data augmentation: normalize with the processor's statistics.
    normalize = Normalize(mean=image_processor.image_mean,
                          std=image_processor.image_std)
    # Processors expose either a single "shortest_edge" or an explicit
    # height/width pair; support both.
    size = (image_processor.size["shortest_edge"] if "shortest_edge" in image_processor.size else (
        image_processor.size["height"], image_processor.size["width"]))
    global _transforms
    _transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize])
    ret = dataset.with_transform(_format_transforms)
    return ret


def _format_transforms(examples):
    """Batch transform: turn PIL images into normalized pixel tensors.

    Converts every image to RGB, applies the module-level augmentation
    pipeline, stores the result under "pixel_values", and drops the raw
    "image" column (the model consumes tensors only).
    """
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples


def train(user_args):
    """Run the full fine-tuning job.

    Args:
        user_args: dict of hyper-parameters from :func:`parse_args`;
            values arrive as strings and are cast where needed. Expected
            keys: save_strategy, lr, per_device_eval_batch_size,
            num_train_epochs, warmup_ratio, logging_steps.

    Side effects: trains the model, saves model/metrics/state into
    ENV_AG_TRAIN_OUTPUT_DIR, and logs evaluation metrics.
    """
    print('Start to process dataset')
    model_path = os.getenv(ENV_AG_MODEL_DIR)
    image_processor = AutoImageProcessor.from_pretrained(model_path)
    dataset = _process_input_data(image_processor)
    print(f"Dataset: {dataset}")
    # Build label <-> id mappings from the dataset's class names.
    classes = dataset["train"].features["label"].names
    label2id = {c: i for i, c in enumerate(classes)}
    id2label = {i: c for i, c in enumerate(classes)}
    print('Start to load model')
    # Load the model; ignore_mismatched_sizes lets us swap the
    # classification head for the dataset's label count.
    model = AutoModelForImageClassification.from_pretrained(
        model_path,
        num_labels=len(classes),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    print('Start to set training args')
    # Training configuration; user-provided values are cast from strings.
    training_args = TrainingArguments(
        output_dir=os.getenv(ENV_AG_TRAIN_OUTPUT_DIR),
        # Keep the "pixel_values" column produced by our transform.
        remove_unused_columns=False,
        evaluation_strategy="epoch",
        save_strategy=user_args['save_strategy'],
        learning_rate=float(user_args['lr']),
        save_total_limit=3,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=int(user_args['per_device_eval_batch_size']),
        num_train_epochs=int(user_args['num_train_epochs']),
        warmup_ratio=float(user_args['warmup_ratio']),
        logging_steps=int(user_args['logging_steps']),
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
    )
    print('Start to train')
    # Assemble the Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DefaultDataCollator(),
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=image_processor,
        compute_metrics=_multi_class_classification_metrics,
    )
    # Start training.
    train_results = trainer.train()
    print('Start to save model')
    # Persist model, metrics and trainer state to the output directory;
    # the platform uploads this directory to the new model repository.
    trainer.save_model()
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
    trainer.save_state()
    print('Start to evaluate')
    # Accuracy evaluation on the held-out split.
    eva_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eva_metrics)
    trainer.save_metrics("eval", eva_metrics)
    print('All Done')


if __name__ == '__main__':
    args = parse_args()
    train(args)
support.huaweicloud.com/aimarket-modelarts/ma_gallery_0055.html