AI开发平台ModelArts-自定义模型规范:“train.py”示例
“train.py”示例
变量名称 |
说明 |
示例 |
---|---|---|
ENV_AG_MODEL_DIR |
模型存放路径,AI Gallery的模型仓库地址,包含模型仓库的所有文件。 |
“/home/ma-user/.cache/gallery/model/ur12345--gpt2” |
ENV_AG_DATASET_DIR |
数据集存放路径,AI Gallery的数据集仓库地址,包含数据集仓库的所有文件。 |
“/home/ma-user/.cache/gallery/dataset/ur12345--data_demo” |
ENV_AG_USER_PARAMS |
配置的训练超参json字符串。创建训练任务时在算法配置页面设置的超参,用json字符串表示。 |
{"per_device_eval_batch_size":"32","lr":"0.001","logging_steps":"24"} |
ENV_AG_TRAIN_OUTPUT_DIR |
训练产物文件存放路径。训练产物将被保存到该路径。训练任务结束后,由AI Gallery平台将该目录上传到新模型的仓库中。 |
“/home/ma-user/.cache/gallery/output” |
ENV_AG_USER_METRICS_LOG_PATH |
训练数据的日志文件存放路径。训练过程中的迭代次数、LOSS和吞吐数据按照“迭代次数|loss|吞吐”格式记录在日志中,AI Gallery通过环境变量找到日志,从中获取实际数据绘制成“吞吐”和“训练LOSS”曲线,呈现在训练的“指标效果”中。具体请参见查看训练效果。
说明:
日志文件中的迭代次数、LOSS和吞吐数据必须按照“迭代次数|loss|吞吐”格式存放,否则AI Gallery会解析数据失败,导致“吞吐”和“训练LOSS”曲线异常。 |
“/var/logs/user_metrics.log” |
"""Example ``train.py`` for an AI Gallery image-classification training task.

Fine-tunes a pretrained image-classification model (Hugging Face
``transformers``) on an image-folder dataset. All paths and hyperparameters
are supplied by the AI Gallery runtime through environment variables.
"""
import json
import os

import numpy as np
from datasets import load_dataset
from sklearn import metrics
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
)
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
)

# --- Environment variable names injected by the AI Gallery runtime ---
# Working directory
ENV_AG_WORK_DIR = 'ENV_AG_WORK_DIR'
# Directory holding the model repository files
ENV_AG_MODEL_DIR = 'ENV_AG_MODEL_DIR'
# Directory holding the dataset repository files
ENV_AG_DATASET_DIR = 'ENV_AG_DATASET_DIR'
# JSON string with the user-configured training hyperparameters
ENV_AG_USER_PARAMS = 'ENV_AG_USER_PARAMS'
# Directory where training artifacts must be written
ENV_AG_TRAIN_OUTPUT_DIR = 'ENV_AG_TRAIN_OUTPUT_DIR'

# Module-level transform pipeline: assigned by _process_input_data and read
# by _format_transforms (which must be a module-level callable so
# ``datasets`` can apply it on the fly).
_transforms = None


def _multi_class_classification_metrics(pred):
    """Compute multi-class classification metrics for ``Trainer.evaluate``.

    Args:
        pred: an ``EvalPrediction``-like pair of (raw logits, labels).

    Returns:
        dict with macro/micro/weighted F1, precision, recall and accuracy.
    """
    raw_predictions, labels = pred
    # Class prediction = arg-max over the logits axis.
    predictions = np.argmax(raw_predictions, axis=1)
    results = {
        "f1_macro": metrics.f1_score(labels, predictions, average="macro"),
        "f1_micro": metrics.f1_score(labels, predictions, average="micro"),
        "f1_weighted": metrics.f1_score(labels, predictions, average="weighted"),
        "precision_macro": metrics.precision_score(labels, predictions, average="macro"),
        "precision_micro": metrics.precision_score(labels, predictions, average="micro"),
        "precision_weighted": metrics.precision_score(labels, predictions, average="weighted"),
        "recall_macro": metrics.recall_score(labels, predictions, average="macro"),
        "recall_micro": metrics.recall_score(labels, predictions, average="micro"),
        "recall_weighted": metrics.recall_score(labels, predictions, average="weighted"),
        "accuracy": metrics.accuracy_score(labels, predictions),
    }
    return results


def parse_args():
    """Parse the user-configured hyperparameter JSON from the environment.

    Returns:
        dict of hyperparameters set on the AI Gallery algorithm page.

    Raises:
        RuntimeError: if ``ENV_AG_USER_PARAMS`` is not set. (The original
            code passed ``None`` straight to ``json.loads``, producing an
            opaque ``TypeError``.)
    """
    raw = os.getenv(ENV_AG_USER_PARAMS)
    if raw is None:
        raise RuntimeError(
            f"Environment variable {ENV_AG_USER_PARAMS} is not set; "
            "it must contain the training hyperparameters as a JSON string"
        )
    return json.loads(raw)


def _process_input_data(image_processor):
    """Load the image-folder dataset and attach the preprocessing transform.

    Args:
        image_processor: the model's ``AutoImageProcessor``, used for the
            normalization statistics and target image size.

    Returns:
        the dataset with ``_format_transforms`` applied on access.
    """
    # Load the dataset from the path provided by AI Gallery.
    dataset_path = os.getenv(ENV_AG_DATASET_DIR)
    dataset = load_dataset("imagefolder", data_dir=dataset_path)

    # Data augmentation / preprocessing pipeline built from the processor's
    # normalization stats and expected input size.
    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
    size = (
        image_processor.size["shortest_edge"]
        if "shortest_edge" in image_processor.size
        else (image_processor.size["height"], image_processor.size["width"])
    )
    global _transforms
    _transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize])
    return dataset.with_transform(_format_transforms)


def _format_transforms(examples):
    """On-the-fly transform: convert images to RGB tensors in "pixel_values"."""
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    # Drop the raw PIL image; the model only consumes "pixel_values".
    del examples["image"]
    return examples


def train(user_args):
    """Fine-tune the pretrained model and save artifacts/metrics.

    Args:
        user_args: hyperparameter dict from :func:`parse_args`; expected keys:
            ``save_strategy``, ``lr``, ``per_device_eval_batch_size``,
            ``num_train_epochs``, ``warmup_ratio``, ``logging_steps``.
    """
    print('Start to process dataset')
    model_path = os.getenv(ENV_AG_MODEL_DIR)
    image_processor = AutoImageProcessor.from_pretrained(model_path)
    dataset = _process_input_data(image_processor)
    print(f"Dataset: {dataset}")

    # Build label<->id mappings for the classification head.
    classes = dataset["train"].features["label"].names
    label2id = {c: i for i, c in enumerate(classes)}
    id2label = {i: c for i, c in enumerate(classes)}

    print('Start to load model')
    # ignore_mismatched_sizes allows replacing the pretrained head with one
    # sized for this dataset's class count.
    model = AutoModelForImageClassification.from_pretrained(
        model_path,
        num_labels=len(classes),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    print('Start to set training args')
    training_args = TrainingArguments(
        output_dir=os.getenv(ENV_AG_TRAIN_OUTPUT_DIR),
        # Keep the "pixel_values" column produced by the transform.
        remove_unused_columns=False,
        evaluation_strategy="epoch",
        save_strategy=user_args['save_strategy'],
        learning_rate=float(user_args['lr']),
        save_total_limit=3,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=int(user_args['per_device_eval_batch_size']),
        num_train_epochs=int(user_args['num_train_epochs']),
        warmup_ratio=float(user_args['warmup_ratio']),
        logging_steps=int(user_args['logging_steps']),
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
    )

    print('Start to train')
    # Build the trainer (the original comment mislabeled this as "training args").
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DefaultDataCollator(),
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=image_processor,
        compute_metrics=_multi_class_classification_metrics,
    )
    # Run training.
    train_results = trainer.train()

    print('Start to save model')
    # Persist the model and training metrics/state to the output directory,
    # which AI Gallery uploads to the new model repository afterwards.
    trainer.save_model()
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
    trainer.save_state()

    print('Start to evaluate')
    # Final accuracy evaluation on the validation split.
    eva_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eva_metrics)
    trainer.save_metrics("eval", eva_metrics)
    print('All Done')


if __name__ == '__main__':
    args = parse_args()
    train(args)