diff --git a/.gitattributes b/.gitattributes index c7d9f3332a950355d5a77d85000f05e6f45435ea..a460cd62cd93a1387e4ee2c75b1c921f90c9df7f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -32,3 +32,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸384.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸复杂.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸高清.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号4k壁纸精细.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号插画.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号水彩.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号素描.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上英文逗号油画.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文逗号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文感叹号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上中文句号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告符号.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_chinese/img/日出,海面上nega广告符号词汇.png filter=lfs diff=lfs merge=lfs -text +fengshen/examples/stable_diffusion_dreambooth/duck_result.png filter=lfs diff=lfs merge=lfs -text diff --git a/fengshen/API/main.py b/fengshen/API/main.py new file mode 100644 index 0000000000000000000000000000000000000000..bd4ac4c4cdad24a090de80ad2a5967d73d5f6099 --- /dev/null +++ b/fengshen/API/main.py @@ -0,0 +1,76 @@ +import uvicorn +import click +import argparse +import json +from importlib import import_module +from fastapi import FastAPI, WebSocket +from starlette.middleware.cors import CORSMiddleware +from utils import user_config, api_logger, setup_logger, RequestDataStructure + +# 命令行启动时只输入一个参数,即配置文件的名字,eg: text_classification.json +# 其余所有配置在该配置文件中设定,不在命令行中指定 +total_parser = argparse.ArgumentParser("API") +total_parser.add_argument("config_path", type=str) +args = total_parser.parse_args() + +# set up user config +user_config.setup_config(args) + +# set up logger 
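+# (setup_logger, defined in utils.py, attaches both a StreamHandler and a FileHandler,
+# so log records go to the console and to <SERVER_NAME>.log, or to log_file_path when it is set)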
+setup_logger(api_logger, user_config) + +# load pipeline +pipeline_class = getattr(import_module('fengshen.pipelines.' + user_config.pipeline_type), 'Pipeline') +model_settings = user_config.model_settings +model_args = argparse.Namespace(**model_settings) +pipeline = pipeline_class( + args = model_args, + model = user_config.model_name + ) + + +# initialize app +app = FastAPI( + title = user_config.PROJECT_NAME, + openapi_url = f"{user_config.API_PREFIX_STR}/openapi.json" + ) + + +# api +# TODO +# 需要针对不同请求方法做不同判断,目前仅跑通了较通用的POST方法 +# POST方法可以完成大多数 输入文本-返回结果 的请求任务 +if(user_config.API_method == "POST"): + @app.post(user_config.API_path, tags = user_config.API_tags) + async def fengshen_post(data:RequestDataStructure): + # logging + api_logger.info(data.input_text) + + input_text = data.input_text + + result = pipeline(input_text) + + return result +else: + print("only support POST method") + + + +# Set all CORS enabled origins +if user_config.BACKEND_CORS_ORIGINS: + app.add_middleware( + CORSMiddleware, + allow_origins = [str(origin) for origin in user_config.BACKEND_CORS_ORIGINS], + allow_credentials = user_config.allow_credentials, + allow_methods = user_config.allow_methods, + allow_headers = user_config.allow_headers, + ) + + +if __name__ == '__main__': + + # 启动后可在浏览器打开 host:port/docs 查看接口的具体信息,并可进行简单测试 + # eg: 127.0.0.1:8990/docs + uvicorn.run(app, host = user_config.SERVER_HOST, port = user_config.SERVER_PORT) + + diff --git a/fengshen/API/text_classification.json b/fengshen/API/text_classification.json new file mode 100644 index 0000000000000000000000000000000000000000..f510becb1306bd66faf2573dbd09044867c08efe --- /dev/null +++ b/fengshen/API/text_classification.json @@ -0,0 +1,46 @@ +{ + "SERVER": { + "SERVER_HOST": "127.0.0.1", + "SERVER_PORT": 8990, + "SERVER_NAME": "fengshen_demo", + "PROJECT_NAME": "fengshen_demo", + "API_PREFIX_STR": "/api", + + "API_method" : "POST", + "API_path": "/TextClassification", + "API_tags": ["TextClassification"], + + "BACKEND_CORS_ORIGINS": ["*"], + "allow_credentials": true, + "allow_methods": ["*"], + "allow_headers": ["*"] + + }, + "LOGGING": { + "log_file_path": "", + "log_level": "INFO" + }, + + "PIPELINE": { + "pipeline_type": "text_classification", + "model_name": "IDEA-CCNL/Erlangshen-Roberta-110M-Similarity", + "model_settings": { + "device": -1, + "texta_name": "sentence", + "textb_name": "sentence2", + "label_name": "label", + "max_length": 512, + "return_tensors": "pt", + "padding": "longest", + "truncation": true, + "skip_special_tokens": true, + "clean_up_tkenization_spaces": true, + + "skip_steps": 10, + "clip_guidance_scale": 7500, + "init_scale": 10 + } + } +} + + \ No newline at end of file diff --git a/fengshen/API/utils.py b/fengshen/API/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..56b2b9a193b17612c1b77d0f170c593b307bc7f3 --- /dev/null +++ b/fengshen/API/utils.py @@ -0,0 +1,167 @@ +from dataclasses import dataclass, field +import os +import json +import logging +from argparse import Namespace +from typing import List, Literal, Optional, Union +from pydantic import AnyHttpUrl, BaseSettings, HttpUrl, validator, BaseModel + + +CURRENT_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) + +# request body +# 使用pydantic对请求中的body数据进行验证 +class RequestDataStructure(BaseModel): + input_text: List[str] = [""] + uuid: Optional[int] + + # parameters for text2image model + input_image: Optional[str] + skip_steps: Optional[int] + clip_guidance_scale: Optional[int] + init_scale: Optional[int] + +# API config 
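+# APIConfig mirrors the three sections of the JSON config file passed on the command line
+# (SERVER / LOGGING / PIPELINE, see text_classification.json); setup_config() fills the
+# fields below at startup. Illustrative usage with the sample config (a sketch only; the
+# request body is validated against RequestDataStructure above):
+#   python main.py text_classification.json
+#   curl -X POST http://127.0.0.1:8990/TextClassification \
+#        -H "Content-Type: application/json" \
+#        -d '{"input_text": ["今天天气真好", "今天天气怎么样"]}'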
+@dataclass +class APIConfig: + + # server config + SERVER_HOST: AnyHttpUrl = "127.0.0.1" + SERVER_PORT: int = 8990 + SERVER_NAME: str = "" + PROJECT_NAME: str = "" + API_PREFIX_STR: str = "/api" + + # api config + API_method: Literal["POST","GET","PUT","OPTIONS","WEBSOCKET","PATCH","DELETE","TRACE","CONNECT"] = "POST" + API_path: str = "/TextClassification" + API_tags: List[str] = field(default_factory = lambda: [""]) + + # CORS config + BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = field(default_factory = lambda: ["*"]) + allow_credentials: bool = True + allow_methods: List[str] = field(default_factory = lambda: ["*"]) + allow_headers: List[str] = field(default_factory = lambda: ["*"]) + + # log config + log_file_path: str = "" + log_level: str = "INFO" + + # pipeline config + pipeline_type: str = "" + model_name: str = "" + + # model config + # device: int = -1 + # texta_name: Optional[str] = "sentence" + # textb_name: Optional[str] = "sentence2" + # label_name: Optional[str] = "label" + # max_length: int = 512 + # return_tensors: str = "pt" + # padding: str = "longest" + # truncation: bool = True + # skip_special_tokens: bool = True + # clean_up_tkenization_spaces: bool = True + + # # parameters for text2image model + # skip_steps: Optional[int] = 0 + # clip_guidance_scale: Optional[int] = 0 + # init_scale: Optional[int] = 0 + + def setup_config(self, args:Namespace) -> None: + + # load config file + with open(CURRENT_DIR_PATH + "/" + args.config_path, "r") as jsonfile: + config = json.load(jsonfile) + + server_config = config["SERVER"] + logging_config = config["LOGGING"] + pipeline_config = config["PIPELINE"] + + # server config + self.SERVER_HOST: AnyHttpUrl = server_config["SERVER_HOST"] + self.SERVER_PORT: int = server_config["SERVER_PORT"] + self.SERVER_NAME: str = server_config["SERVER_NAME"] + self.PROJECT_NAME: str = server_config["PROJECT_NAME"] + self.API_PREFIX_STR: str = server_config["API_PREFIX_STR"] + + # api config + self.API_method: Literal["POST","GET","PUT","OPTIONS","WEBSOCKET","PATCH","DELETE","TRACE","CONNECT"] = server_config["API_method"] + self.API_path: str = server_config["API_path"] + self.API_tags: List[str] = server_config["API_tags"] + + # CORS config + self.BACKEND_CORS_ORIGINS: List[AnyHttpUrl] = server_config["BACKEND_CORS_ORIGINS"] + self.allow_credentials: bool = server_config["allow_credentials"] + self.allow_methods: List[str] = server_config["allow_methods"] + self.allow_headers: List[str] = server_config["allow_headers"] + + # log config + self.log_file_path: str = logging_config["log_file_path"] + self.log_level: str = logging_config["log_level"] + + # pipeline config + self.pipeline_type: str = pipeline_config["pipeline_type"] + self.model_name: str = pipeline_config["model_name"] + + # general model config + self.model_settings: dict = pipeline_config["model_settings"] + + # 由于pipeline本身会解析参数,后续参数可以不要 + # 直接将model_settings字典转为Namespace后作为pipeline的args参数即可 + + # self.device: int = self.model_settings["device"] + # self.texta_name: Optional[str] = self.model_settings["texta_name"] + # self.textb_name: Optional[str] = self.model_settings["textb_name"] + # self.label_name: Optional[str] = self.model_settings["label_name"] + # self.max_length: int = self.model_settings["max_length"] + # self.return_tensors: str = self.model_settings["return_tensors"] + # self.padding: str = self.model_settings["padding"] + # self.truncation: bool = self.model_settings["truncation"] + # self.skip_special_tokens: bool = self.model_settings["skip_special_tokens"] + # 
self.clean_up_tkenization_spaces: bool = self.model_settings["clean_up_tkenization_spaces"] + + # # specific parameters for text2image model + # self.skip_steps: Optional[int] = self.model_settings["skip_steps"] + # self.clip_guidance_scale: Optional[int] = self.model_settings["clip_guidance_scale"] + # self.init_scale: Optional[int] = self.model_settings["init_scale"] + + + +def setup_logger(logger, user_config: APIConfig): + + # default level: INFO + + logger.setLevel(getattr(logging, user_config.log_level, "INFO")) + ch = logging.StreamHandler() + + if(user_config.log_file_path == ""): + fh = logging.FileHandler(filename = CURRENT_DIR_PATH + "/" + user_config.SERVER_NAME + ".log") + elif(".log" not in user_config.log_file_path[-5:-1]): + fh = logging.FileHandler(filename = user_config.log_file_path + "/" + user_config.SERVER_NAME + ".log") + else: + fh = logging.FileHandler(filename = user_config.log_file_path) + + + formatter = logging.Formatter( + "%(asctime)s - %(module)s - %(funcName)s - line:%(lineno)d - %(levelname)s - %(message)s" + ) + + ch.setFormatter(formatter) + fh.setFormatter(formatter) + logger.addHandler(ch) # Exporting logs to the screen + logger.addHandler(fh) # Exporting logs to a file + + return logger + +user_config = APIConfig() +api_logger = logging.getLogger() + + + + + + + + + diff --git a/fengshen/README.md b/fengshen/README.md new file mode 100644 index 0000000000000000000000000000000000000000..45f7b3579c36a68f899a9a02cfcfbe1330d413d8 --- /dev/null +++ b/fengshen/README.md @@ -0,0 +1,105 @@ +## 最新发布 + +* \[2022.09.13\] [更新ErLangShen系列DeBERTa预训练代码](https://huggingface.co./IDEA-CCNL/Erlangshen-DeBERTa-v2-97M-Chinese) +* \[2022.09.13\] [更新RanDeng系列Bart预训练代码](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M) +* \[2022.09.13\] [更新ErLangShen系列Bert预训练代码](https://huggingface.co./IDEA-CCNL/Erlangshen-MegatronBert-1.3B) +* \[2022.05.11\] [更新TaiYi系列VIT多模态模型及下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/太乙系列/Taiyi-vit-87M-D.html) +* \[2022.05.11\] [更新BiGan系列Transformer-XL去噪模型及下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/比干系列/Bigan-Transformer-XL-denoise-1.1B.html) +* \[2022.05.11\] [更新ErLangShen系列下游任务示例](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/二郎神系列/Erlangshen-Roberta-110M-NLI.html) + +# 导航 + +- [导航](#导航) + - [框架简介](#框架简介) + - [依赖环境](#依赖环境) + - [项目结构](#项目结构) + - [设计思路](#设计思路) + - [分类下游任务](#分类下游任务) + +## 框架简介 + +FengShen训练框架是封神榜大模型开源计划的重要一环,在大模型的生产和应用中起到至关重要的作用。FengShen可以应用在基于海量数据的预训练以及各种下游任务的finetune中。封神榜专注于NLP大模型开源,然而模型的增大带来不仅仅是训练的问题,在使用上也存在诸多不便。为了解决训练和使用的问题,FengShen参考了目前开源的优秀方案并且重新设计了Pipeline,用户可以根据自己的需求,从封神榜中选取丰富的预训练模型,同时利用FengShen快速微调下游任务。 + +目前所有实例以及文档可以查看我们的[Wiki](https://fengshenbang-doc.readthedocs.io/zh/latest/index.html) +所有的模型可以在[Huggingface主页](https://huggingface.co./IDEA-CCNL)找到 + +通过我们的框架,你可以快速享受到: + +1. 比原生torch更强的性能,训练速度提升**300%** +2. 支持更大的模型,支持**百亿级别**内模型训练及微调 +3. 支持**TB级以上**的数据集,在家用主机上即可享受预训练模型带来的效果提升 +3. 丰富的预训练、下游任务示例,一键开始训练 +4. 适应各种设备环境,支持在CPU、GPU、TPU等不同设备上运行 +5. 
集成主流的分布式训练逻辑,无需修改代码即可支持DDP、Zero Optimizer等分布式优化技术 + +![avartar](../pics/fengshen_pic.png) + +## 依赖环境 + +* Python >= 3.8 +* torch >= 1.8 +* transformers >= 3.2.0 +* pytorch-lightning >= 1.5.10 + +在Fengshenbang-LM根目录下 +pip install --editable ./ + +## 项目结构 + +``` +├── data # 支持多种数据处理方式以及数据集 +│   ├── cbart_dataloader +| ├── fs_datasets # 基于transformers datasets的封装,新增中文数据集(开源计划中) +| ├── universal_datamodule # 打通fs_datasets与lightning datamodule,减少重复开发工作量 +│   ├── megatron_dataloader # 支持基于Megatron实现的TB级别数据集处理、训练 +│   ├── mmap_dataloader # 通用的Memmap形式的数据加载 +│   └── task_dataloader # 支持多种下游任务 +├── examples # 丰富的示例,从预训练到下游任务,应有尽有。 +├── metric # 提供各种metric计算,支持用户自定义metric +├── losses # 同样支持loss自定义,满足定制化需求 +├── tokenizer # 支持自定义tokenizer,比如我们使用的SentencePiece训练代码等 +├── models # 模型库 +│   ├── auto # 支持自动导入对应的模型 +│   ├── bart +│   ├── longformer +│   ├── megatron_t5 +│   └── roformer +└── utils # 实用函数 +``` + +## 设计思路 + +FengShen框架目前整体基于Pytorch-Lightning & Transformer进行开发,在底层框架上不断开源基于中文的预训练模型,同时提供丰富的examples,每一个封神榜的模型都能找到对应的预训练、下游任务代码。 + +在FengShen上开发,整体可以按照下面的三个步骤进行: + +1. 封装数据处理流程 -> pytorch_lightning.LightningDataModule +2. 封装模型结构 -> pytorch_lightning.LightningModule +3. 配置一些插件,比如log_monitor,checkpoint_callback等等。 + +一个完整的DEMO可以看Randeng-BART系列实例 -> [文档](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/燃灯系列/BART-139M.html) [代码](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/hf-ds/fengshen/examples/pretrain_bart) + +## 分类下游任务 + + 在examples/classification目录下,我们提供丰富的分类任务的示例,其中我们提供三个一键式运行的示例。 + +* demo_classification_afqmc_roberta.sh 使用DDP微调roberta +* demo_classification_afqmc_roberta_deepspeed.sh 结合deepspeed微调roberta,获得更快的运算速度 +* demo_classification_afqmc_erlangshen_offload.sh 仅需7G显存即可微调我们效果最好的二郎神系列模型 + + 上述示例均采用AFQMC的数据集,关于数据集的介绍可以在[这里](https://www.cluebenchmarks.com/introduce.html)找到。 + 同时我们处理过的数据文件已经放在Huggingface上,点击[这里](https://huggingface.co./datasets/IDEA-CCNL/AFQMC)直达源文件。 + 仅需要按我们的格式稍微处理一下数据集,即可适配下游不同的分类任务。 + 在脚本示例中,仅需要修改如下参数即可适配本地文件 + + ``` + --dataset_name IDEA-CCNL/AFQMC \ + + -------> 修改为 + + --data_dir $DATA_DIR \ # 数据目录 + --train_data train.json \ # 数据文件 + --valid_data dev.json \ + --test_data test.json \ + + ``` diff --git a/fengshen/__init__.py b/fengshen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5cc52d128218a4878e5778502e25eadf54cf1261 --- /dev/null +++ b/fengshen/__init__.py @@ -0,0 +1,19 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
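+# Convenience re-exports: the imports below expose the most commonly used model classes and
+# the Ubert pipeline at the package top level, e.g. `from fengshen import UbertPipelines`.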
+ +from .models.longformer import LongformerConfig, LongformerModel +from .models.roformer import RoFormerConfig, RoFormerModel +from .models.megatron_t5 import T5Config, T5EncoderModel +from .models.ubert import UbertPipelines, UbertModel diff --git a/fengshen/cli/fengshen_pipeline.py b/fengshen/cli/fengshen_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..07c31349ef96fd86d0c14b807601c645b095372f --- /dev/null +++ b/fengshen/cli/fengshen_pipeline.py @@ -0,0 +1,34 @@ +import sys +from importlib import import_module +from datasets import load_dataset +import argparse + + +def main(): + if len(sys.argv) < 3: + raise Exception( + 'args len < 3, example: fengshen_pipeline text_classification predict xxxxx') + pipeline_name = sys.argv[1] + method = sys.argv[2] + pipeline_class = getattr(import_module('fengshen.pipelines.' + pipeline_name), 'Pipeline') + + total_parser = argparse.ArgumentParser("FengShen Pipeline") + total_parser.add_argument('--model', default='', type=str) + total_parser.add_argument('--datasets', default='', type=str) + total_parser.add_argument('--text', default='', type=str) + total_parser = pipeline_class.add_pipeline_specific_args(total_parser) + args = total_parser.parse_args(args=sys.argv[3:]) + pipeline = pipeline_class(args=args, model=args.model) + + if method == 'predict': + print(pipeline(args.text)) + elif method == 'train': + datasets = load_dataset(args.datasets) + pipeline.train(datasets) + else: + raise Exception( + 'cmd not support, now only support {predict, train}') + + +if __name__ == '__main__': + main() diff --git a/fengshen/data/__init__.py b/fengshen/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/data/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/fengshen/data/bert_dataloader/auto_split.sh b/fengshen/data/bert_dataloader/auto_split.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a0f66d01df8f1728e44d9deb1d37e0396c5143a --- /dev/null +++ b/fengshen/data/bert_dataloader/auto_split.sh @@ -0,0 +1,10 @@ +files=`find $1 -type f -size +1024M` + +for p in $files +do +echo "processing $p" +name=`basename $p .json` +file=`dirname $p` +split -a 2 -C 300M $p $file/$name- && ls|grep -E "(-[a-zA-Z]{2})" |xargs -n1 -i{} mv {} {}.json +rm -f $p +done \ No newline at end of file diff --git a/fengshen/data/bert_dataloader/load.py b/fengshen/data/bert_dataloader/load.py new file mode 100644 index 0000000000000000000000000000000000000000..b36ce8ae72b74e9fd006f087ee0810a306badd7e --- /dev/null +++ b/fengshen/data/bert_dataloader/load.py @@ -0,0 +1,200 @@ +import os +import re +from pathlib import Path +import glob +from tqdm import tqdm +from contextlib import ExitStack +import datasets +import multiprocessing +from typing import cast, TextIO +from itertools import chain +import json +from concurrent.futures import ProcessPoolExecutor +from random import shuffle +from pytorch_lightning import LightningDataModule +from typing import Optional + +from torch.utils.data import DataLoader + + +# _SPLIT_DATA_PATH = '/data1/datas/wudao_180g_split/test' +_SPLIT_DATA_PATH = '/data1/datas/wudao_180g_split' +_CACHE_SPLIT_DATA_PATH = '/data1/datas/wudao_180g_FSData' + +# feats = datasets.Features({"text": datasets.Value('string')}) + + +class BertDataGenerate(object): + + def __init__(self, + data_files=_SPLIT_DATA_PATH, + save_path=_CACHE_SPLIT_DATA_PATH, + train_test_validation='950,49,1', + num_proc=1, + 
cache=True): + self.data_files = Path(data_files) + if save_path: + self.save_path = Path(save_path) + else: + self.save_path = self.file_check( + Path(self.data_files.parent, self.data_files.name+'_FSDataset'), + 'save') + self.num_proc = num_proc + self.cache = cache + self.split_idx = self.split_train_test_validation_index(train_test_validation) + if cache: + self.cache_path = self.file_check( + Path(self.save_path.parent, 'FSDataCache', self.data_files.name), 'cache') + else: + self.cache_path = None + + @staticmethod + def file_check(path, path_type): + print(path) + if not path.exists(): + path.mkdir(parents=True) + print(f"Since no {path_type} directory is specified, the program will automatically create it in {path} directory.") + return str(path) + + @staticmethod + def split_train_test_validation_index(train_test_validation): + split_idx_ = [int(i) for i in train_test_validation.split(',')] + idx_dict = { + 'train_rate': split_idx_[0]/sum(split_idx_), + 'test_rate': split_idx_[1]/sum(split_idx_[1:]) + } + return idx_dict + + def process(self, index, path): + print('saving dataset shard {}'.format(index)) + + ds = (datasets.load_dataset('json', data_files=str(path), + cache_dir=self.cache_path, + features=None)) + # ds = ds.map(self.cut_sent,input_columns='text') + # print(d) + # print('!!!',ds) + ds = ds['train'].train_test_split(train_size=self.split_idx['train_rate']) + ds_ = ds['test'].train_test_split(train_size=self.split_idx['test_rate']) + ds = datasets.DatasetDict({ + 'train': ds['train'], + 'test': ds_['train'], + 'validation': ds_['test'] + }) + # print('!!!!',ds) + ds.save_to_disk(Path(self.save_path, path.name)) + return 'saving dataset shard {} done'.format(index) + + def generate_cache_arrow(self) -> None: + ''' + 生成HF支持的缓存文件,加速后续的加载 + ''' + data_dict_paths = self.data_files.rglob('*') + p = ProcessPoolExecutor(max_workers=self.num_proc) + res = list() + + for index, path in enumerate(data_dict_paths): + res.append(p.submit(self.process, index, path)) + + p.shutdown(wait=True) + for future in res: + print(future.result(), flush=True) + + +def load_dataset(num_proc=4, **kargs): + cache_dict_paths = Path(_CACHE_SPLIT_DATA_PATH).glob('*') + ds = [] + res = [] + p = ProcessPoolExecutor(max_workers=num_proc) + for path in cache_dict_paths: + res.append(p.submit(datasets.load_from_disk, + str(path), **kargs)) + + p.shutdown(wait=True) + for future in res: + ds.append(future.result()) + # print(future.result()) + train = [] + test = [] + validation = [] + for ds_ in ds: + train.append(ds_['train']) + test.append(ds_['test']) + validation.append(ds_['validation']) + # ds = datasets.concatenate_datasets(ds) + # print(ds) + return datasets.DatasetDict({ + 'train': datasets.concatenate_datasets(train), + 'test': datasets.concatenate_datasets(test), + 'validation': datasets.concatenate_datasets(validation) + }) + + +class BertDataModule(LightningDataModule): + @ staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Universal DataModule') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_batchsize', default=32, type=int) + parser.add_argument('--val_batchsize', default=32, type=int) + parser.add_argument('--test_batchsize', default=32, type=int) + parser.add_argument('--datasets_name', type=str) + # parser.add_argument('--datasets_name', type=str) + parser.add_argument('--train_datasets_field', type=str, default='train') + parser.add_argument('--val_datasets_field', type=str, 
default='validation') + parser.add_argument('--test_datasets_field', type=str, default='test') + return parent_args + + def __init__( + self, + tokenizer, + collate_fn, + args, + **kwargs, + ): + super().__init__() + self.datasets = load_dataset(num_proc=args.num_workers) + self.tokenizer = tokenizer + self.collate_fn = collate_fn + self.save_hyperparameters(args) + + def setup(self, stage: Optional[str] = None) -> None: + self.train = DataLoader( + self.datasets[self.hparams.train_datasets_field], + batch_size=self.hparams.train_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + self.val = DataLoader( + self.datasets[self.hparams.val_datasets_field], + batch_size=self.hparams.val_batchsize, + shuffle=False, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + self.test = DataLoader( + self.datasets[self.hparams.test_datasets_field], + batch_size=self.hparams.test_batchsize, + shuffle=False, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + return + + def train_dataloader(self): + return self.train + + def val_dataloader(self): + return self.val + + def test_dataloader(self): + return self.test + + +if __name__ == '__main__': + # pre = PreProcessing(_SPLIT_DATA_PATH) + # pre.processing() + + dataset = BertDataGenerate(_SPLIT_DATA_PATH, num_proc=16) + dataset.generate_cache_arrow() diff --git a/fengshen/data/bert_dataloader/preprocessing.py b/fengshen/data/bert_dataloader/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..c40e39a8122a5cc4ebd57b558f451c371f6066a3 --- /dev/null +++ b/fengshen/data/bert_dataloader/preprocessing.py @@ -0,0 +1,110 @@ +import re +import json +import multiprocessing +from tqdm import tqdm +from pathlib import Path +from itertools import chain + +_SPLIT_DATA_PATH = '/data1/datas/wudao_180g' + + +def cut_sent(path): + """ + 中文分句,默认?、。、!、省略号分句,考虑双引号包裹的句子 + 采用分割替换的方式 + """ + path = Path(path) + # print(path) + save_path = str(Path('/data1/datas/wudao_180g_split', path.name)) + print('处理文件:', save_path) + with open(save_path, 'wt', encoding='utf-8') as w: + with open(path, 'rt', encoding='utf-8') as f: + for para in tqdm(f): + para = json.loads(para) + para_ = para['text'] + ' ' + # print('sentence piece......') + # pep8中 正则不能些 \? 要写成\\? 
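+                # Splitting strategy: insert a "#####" marker after sentence-final punctuation
+                # (?。!?!… plus the quote-closed cases handled below), then split on "#####"
+                # and re-pack the pieces into samples of at most ~512 characters.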
+ para_ = re.sub('([?。!\\?\\!…]+)([^”’]|[”’])', + r'\1#####\2', para_) + para_ = re.sub('([\\.]{3,})([^”’])', r'\1#####\2', para_) + + # 匹配 \1: 句子结束符紧挨’” \2: 非句子结束符号,被引号包裹的句子 + para_ = re.sub( + '([。!?\\?\\!…][”’])([^,。!?\\?\\!]|\\s)', r'\1#####\2', para_) + para_ = re.sub( + '([\\.]{3,}[”’])([^,。!?\\?\\!]|\\s)', r'\1#####\2', para_) + para_ = re.sub( + '([#]{5})([”’])([^,。!?\\?\\!])', r'\2#####\3', para_) + para_ = para_.strip() + # 一个512里面多个样本 + line_ = '' + for line in para_.split('#####'): + line = line.strip() + if len(line_) < 512 and len(line) > 0: + line_ += line + else: + w.writelines(json.dumps( + {'text': line_}, ensure_ascii=False)+'\n') + line_ = line + w.writelines(json.dumps( + {'text': line_}, ensure_ascii=False)+'\n') + + +def chain_iter(*filenames): + """ + 将多个文件读成一个迭代器 + """ + reader = [open(file, 'r') for file in filenames] + return chain(*reader) + + +class Config(object): + + def __init__(self, data_path=_SPLIT_DATA_PATH, num_worker=16, split_numb=600000, cut_sentence=True, output_file=None) -> None: + self.data_path = Path(data_path) + self.num_worker = num_worker + self.split_numb = split_numb + self.cut_sentence = cut_sentence + + +def processing1(): + args = Config() + p_ = [str(i) for i in args.data_path.glob('*')] + fin = chain_iter(*p_) + pool = multiprocessing.Pool(args.num_worker) + docs = pool.imap(cut_sent, fin, chunksize=args.num_worker) + + if not Path(args.data_path.parent, args.data_path.name+'_split').exists(): + Path(args.data_path.parent, args.data_path.name+'_split').mkdir() + writer = open(str(Path(args.data_path.parent, args.data_path.name + + '_split', 'sentence_level.json')), 'wt', encoding='utf-8') + for doc in tqdm(docs): + for sentence in doc: + writer.writelines(json.dumps( + {"text": sentence}, ensure_ascii=False)+'\n') + pool.close() + pool.join() + writer.close() + + +if __name__ == '__main__': + from time import process_time, perf_counter + from random import shuffle + st = process_time() + args = Config(num_worker=16) + + if not Path(args.data_path.parent, args.data_path.name+'_split').exists(): + Path(args.data_path.parent, args.data_path.name + + '_split').mkdir(parents=True) + + p_ = [str(i) for i in args.data_path.glob('*')] + # 简单shuffle + shuffle(p_) + + pool = multiprocessing.Pool(args.num_worker) + for item in p_: + pool.apply_async(func=cut_sent, args=(item,)) + pool.close() + pool.join() + cost_time = process_time() - st + print('DONE!! 
cost time : %.5f' % cost_time) diff --git a/fengshen/data/clip_dataloader/flickr.py b/fengshen/data/clip_dataloader/flickr.py new file mode 100644 index 0000000000000000000000000000000000000000..22155e039f74b49c8a4222a75144a2c134a6d507 --- /dev/null +++ b/fengshen/data/clip_dataloader/flickr.py @@ -0,0 +1,105 @@ +from torch.utils.data import Dataset, DataLoader +from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ + CenterCrop +from transformers import BertTokenizer +import pytorch_lightning as pl +from PIL import Image +import os + + +class flickr30k_CNA(Dataset): + def __init__(self, img_root_path, + annot_path, + transform=None): + self.images = [] + self.captions = [] + self.labels = [] + self.root = img_root_path + with open(annot_path, 'r') as f: + for line in f: + line = line.strip().split('\t') + key, caption = line[0].split('#')[0], line[1] + img_path = key + '.jpg' + self.images.append(img_path) + self.captions.append(caption) + self.labels.append(key) + self.transforms = transform + self.tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext") + + # NOTE large 模型 + self.context_length = 77 + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = str(self.images[idx]) + image = self.transforms(Image.open(os.path.join(self.root, img_path))) + text = self.tokenizer(str(self.captions[idx]), max_length=self.context_length, + padding='max_length', truncation=True, return_tensors='pt')['input_ids'][0] + label = self.labels[idx] + return image, text, label + + +def _convert_to_rgb(image): + return image.convert('RGB') + + +def image_transform( + image_size: int, + is_train: bool, + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711) +): + normalize = Normalize(mean=mean, std=std) + if is_train: + return Compose([ + RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC), + _convert_to_rgb, + ToTensor(), + normalize, + ]) + else: + return Compose([ + Resize(image_size, interpolation=InterpolationMode.BICUBIC), + CenterCrop(image_size), + _convert_to_rgb, + ToTensor(), + normalize, + ]) + + +class FlickrDataModule(pl.LightningDataModule): + def __init__(self, args): + self.batch_size = args.batch_size + self.train_filename = args.train_filename # NOTE 标注的文件夹 + self.train_root = args.train_root # NOTE 图片地址 + self.val_filename = args.val_filename + self.val_root = args.val_root + self.test_filename = args.test_filename + self.test_root = args.test_root + + self.pretrain_model = args.pretrain_model + self.image_size = 224 + self.prepare_data_per_node = True + self._log_hyperparams = False + self.num_workers = args.num_workers + + def setup(self, stage=None): + # dataset + train_transform = image_transform(224, True) + val_transform = image_transform(224, False) + test_transform = image_transform(224, False) + + self.train_dataset = flickr30k_CNA(self.train_root, self.train_filename, transform=train_transform) + self.val_dataset = flickr30k_CNA(self.val_root, self.val_filename, transform=val_transform) + self.test_dataset = flickr30k_CNA(self.test_root, self.test_filename, transform=test_transform) + + def train_dataloader(self): + return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers) + + def test_dataloader(self): + return 
DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers) diff --git a/fengshen/data/data_utils/common_utils.py b/fengshen/data/data_utils/common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3eef10ecb8c73257ab4338a0ea2e7839b82bcc7e --- /dev/null +++ b/fengshen/data/data_utils/common_utils.py @@ -0,0 +1,4 @@ +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - len(ids) + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff diff --git a/fengshen/data/data_utils/mask_utils.py b/fengshen/data/data_utils/mask_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0009f00272bf6feff1dbd491153332584cb431e1 --- /dev/null +++ b/fengshen/data/data_utils/mask_utils.py @@ -0,0 +1,285 @@ +import collections + +import numpy as np + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert", + zh_tokenizer=None): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + ''' + modified from Megatron-LM + Args: + tokens: 输入 + vocab_id_list: 词表token_id_list + vocab_id_to_token_dict: token_id到token字典 + masked_lm_prob:mask概率 + cls_id、sep_id、mask_id:特殊token + max_predictions_per_seq:最大mask个数 + np_rng:mask随机数 + max_ngrams:最大词长度 + do_whole_word_mask:是否做全词掩码 + favor_longer_ngram:优先用长的词 + do_permutation:是否打乱 + geometric_dist:用np_rng.geometric做随机 + masking_style:mask类型 + zh_tokenizer:WWM的分词器,比如用jieba.lcut做分词之类的 + ''' + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + # 如果没有指定中文分词器,那就直接按##算 + if zh_tokenizer is None: + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
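+            # e.g. the pieces ["play", "##ing"] end up in one candidate group [i, i+1],
+            # so a whole-word mask always covers both pieces together.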
+ if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + else: + # 如果指定了中文分词器,那就先用分词器分词,然后再进行判断 + # 获取去掉CLS SEP的原始文本 + raw_tokens = [] + for t in tokens: + if t != cls_id and t != sep_id: + raw_tokens.append(t) + raw_tokens = [vocab_id_to_token_dict[i] for i in raw_tokens] + # 分词然后获取每次字开头的最长词的长度 + word_list = set(zh_tokenizer(''.join(raw_tokens), HMM=True)) + word_length_dict = {} + for w in word_list: + if len(w) < 1: + continue + if w[0] not in word_length_dict: + word_length_dict[w[0]] = len(w) + elif word_length_dict[w[0]] < len(w): + word_length_dict[w[0]] = len(w) + i = 0 + # 从词表里面检索 + while i < len(tokens): + token_id = tokens[i] + token = vocab_id_to_token_dict[token_id] + if len(token) == 0 or token_id == cls_id or token_id == sep_id: + token_boundary[i] = 1 + i += 1 + continue + word_max_length = 1 + if token[0] in word_length_dict: + word_max_length = word_length_dict[token[0]] + j = 0 + word = '' + word_end = i+1 + # 兼容以前##的形式,如果后面的词是##开头的,那么直接把后面的拼到前面当作一个词 + old_style = False + while word_end < len(tokens) and vocab_id_to_token_dict[tokens[word_end]].startswith('##'): + old_style = True + word_end += 1 + if not old_style: + while j < word_max_length and i+j < len(tokens): + cur_token = tokens[i+j] + word += vocab_id_to_token_dict[cur_token] + j += 1 + if word in word_list: + word_end = i+j + cand_indexes.append([p for p in range(i, word_end)]) + token_boundary[i] = 1 + i = word_end + + output_tokens = list(tokens) + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + # 获取一个ngram的idx,对于每个word,记录他的ngram的word + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. 
+ while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + token_id = tokens[index] + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance(index=index, label=token_id)) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) diff --git a/fengshen/data/data_utils/sentence_split.py b/fengshen/data/data_utils/sentence_split.py new file mode 100644 index 0000000000000000000000000000000000000000..1a25e4b51b13f86f4a8a6b39f497f85c050856b6 --- /dev/null +++ b/fengshen/data/data_utils/sentence_split.py @@ -0,0 +1,35 @@ +import re + + +class ChineseSentenceSplitter(object): + def merge_symmetry(self, sentences, symmetry=('“', '”')): + # '''合并对称符号,如双引号''' + effective_ = [] + merged = True + for index in range(len(sentences)): + if symmetry[0] in sentences[index] and symmetry[1] not in sentences[index]: + merged = False + effective_.append(sentences[index]) + elif symmetry[1] in sentences[index] and not merged: + merged = True + effective_[-1] += sentences[index] + elif symmetry[0] not in sentences[index] and symmetry[1] not in sentences[index] and not merged: + effective_[-1] += sentences[index] + else: + effective_.append(sentences[index]) + return [i.strip() for i in effective_ if len(i.strip()) > 0] + + def to_sentences(self, paragraph): + # """由段落切分成句子""" + sentences = re.split(r"(?|。|[!]+|!|\…\…)", paragraph) + sentences.append("") + sentences = ["".join(i) for i in zip(sentences[0::2], sentences[1::2])] + sentences = [i.strip() for i in sentences if len(i.strip()) > 0] + for j in range(1, len(sentences)): + if sentences[j][0] == '”': + sentences[j-1] = sentences[j-1] + '”' + sentences[j] = sentences[j][1:] + return self.merge_symmetry(sentences) + + def tokenize(self, text): + return self.to_sentences(text) diff --git a/fengshen/data/data_utils/sop_utils.py b/fengshen/data/data_utils/sop_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..505f14dca99638b10eee0a4017447401a71ef083 --- /dev/null +++ b/fengshen/data/data_utils/sop_utils.py @@ -0,0 +1,32 @@ + +# copy from megatron +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. 
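+        # i.e. a_end is drawn from [1, n_sentences - 1], so segment B always keeps at least one sentence.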
+ a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random diff --git a/fengshen/data/data_utils/token_type_utils.py b/fengshen/data/data_utils/token_type_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3b805d23b9aa4cda495d3b76ecba7effdc2854eb --- /dev/null +++ b/fengshen/data/data_utils/token_type_utils.py @@ -0,0 +1,25 @@ +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes diff --git a/fengshen/data/data_utils/truncate_utils.py b/fengshen/data/data_utils/truncate_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba4c6b653762c01a26da1bea9cb3d3cbeec08fd7 --- /dev/null +++ b/fengshen/data/data_utils/truncate_utils.py @@ -0,0 +1,19 @@ + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + # print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True diff --git a/fengshen/data/dreambooth_datasets/dreambooth_datasets.py b/fengshen/data/dreambooth_datasets/dreambooth_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..6f94216f3dadbd5423dfdb53fe1b2ff9382fb4d5 --- /dev/null +++ b/fengshen/data/dreambooth_datasets/dreambooth_datasets.py @@ -0,0 +1,183 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : dreambooth_datasets.py +@Time : 2022/11/10 00:20 +@Author : Gan Ruyi +@Version : 1.0 +@Contact : ganruyi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +from torch.utils.data import Dataset +from torchvision import transforms +from PIL import Image +from pathlib import Path + + +def add_data_args(parent_args): + parser = parent_args.add_argument_group('taiyi stable diffusion data args') + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If not have enough images, additional images will be" + " sampled with class_prompt." + ), + ) + parser.add_argument( + "--resolution", type=int, default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", default=False, + help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + return parent_args + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
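+    When class_data_dir is given, prior-preservation class images are returned alongside the
+    instance images and the dataset length becomes max(num_class_images, num_instance_images).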
+ """ + + def __init__( + self, + instance_data_dir, + instance_prompt, + tokenizer, + class_data_dir=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_dir = Path(instance_data_dir) + if not self.instance_data_dir.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_dir).iterdir()) + print(self.instance_images_path) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_dir is not None: + self.class_data_dir = Path(class_data_dir) + self.class_data_dir.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_dir.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_dir = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + padding="do_not_pad", + truncation=True, + max_length=64, + # max_length=self.tokenizer.model_max_length, + ).input_ids + + if self.class_data_dir: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + padding="do_not_pad", + truncation=True, + # max_length=self.tokenizer.model_max_length, + max_length=64, + ).input_ids + + return example + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example diff --git a/fengshen/data/hubert/hubert_dataset.py b/fengshen/data/hubert/hubert_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d8eaa25a5238740cc86a05af257aa3e0996f1499 --- /dev/null +++ b/fengshen/data/hubert/hubert_dataset.py @@ -0,0 +1,361 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +import logging +import os +import sys +from typing import Any, List, Optional, Union + +import numpy as np + +import torch +import torch.nn.functional as F +from fairseq.data import data_utils +from fairseq.data.fairseq_dataset import FairseqDataset + +logger = logging.getLogger(__name__) + + +def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Hubert Dataset') + parser.add_argument('--data', type=str) + parser.add_argument('--sample_rate', type=float, default=16000) + parser.add_argument('--label_dir', type=str) + parser.add_argument('--labels', type=str, nargs='+') + parser.add_argument('--label_rate', type=float) + parser.add_argument('--max_keep_size', type=int, default=None) + parser.add_argument('--min_sample_size', type=int) + parser.add_argument('--max_sample_size', type=int) + parser.add_argument('--pad_audio', type=bool) + parser.add_argument('--normalize', type=bool) + parser.add_argument('--random_crop', type=bool) + parser.add_argument('--single_target', type=bool, default=False) + return parent_args + + +def load_audio(manifest_path, max_keep, min_keep): + n_long, n_short = 0, 0 + names, inds, sizes = [], [], [] + with open(manifest_path) as f: + root = f.readline().strip() + for ind, line in enumerate(f): + items = line.strip().split("\t") + assert len(items) == 2, line + sz = int(items[1]) + if min_keep is not None and sz < min_keep: + n_short += 1 + elif max_keep is not None and sz > max_keep: + n_long += 1 + else: + names.append(items[0]) + inds.append(ind) + sizes.append(sz) + tot = ind + 1 + logger.info( + ( + f"max_keep={max_keep}, min_keep={min_keep}, " + f"loaded {len(names)}, skipped {n_short} short and {n_long} long, " + f"longest-loaded={max(sizes)}, shortest-loaded={min(sizes)}" + ) + ) + return root, names, inds, tot, sizes + + +def load_label(label_path, inds, tot): + with open(label_path) as f: + labels = [line.rstrip() for line in f] + assert ( + len(labels) == tot + ), f"number of labels does not match ({len(labels)} != {tot})" + labels = [labels[i] for i in inds] + return labels + + +def load_label_offset(label_path, inds, tot): + with open(label_path) as f: + code_lengths = [len(line.encode("utf-8")) for line in f] + assert ( + len(code_lengths) == tot + ), f"number of labels does not match ({len(code_lengths)} != {tot})" + offsets = list(itertools.accumulate([0] + code_lengths)) + offsets = [(offsets[i], offsets[i + 1]) for i in inds] + return offsets + + +def verify_label_lengths( + audio_sizes, + audio_rate, + label_path, + label_rate, + inds, + tot, + tol=0.1, # tolerance in seconds +): + if label_rate < 0: + logger.info(f"{label_path} is sequence label. skipped") + return + + with open(label_path) as f: + lengths = [len(line.rstrip().split()) for line in f] + assert len(lengths) == tot + lengths = [lengths[i] for i in inds] + num_invalid = 0 + for i, ind in enumerate(inds): + dur_from_audio = audio_sizes[i] / audio_rate + dur_from_label = lengths[i] / label_rate + if abs(dur_from_audio - dur_from_label) > tol: + logger.warning( + ( + f"audio and label duration differ too much " + f"(|{dur_from_audio} - {dur_from_label}| > {tol}) " + f"in line {ind+1} of {label_path}. Check if `label_rate` " + f"is correctly set (currently {label_rate}). " + f"num. 
of samples = {audio_sizes[i]}; " + f"label length = {lengths[i]}" + ) + ) + num_invalid += 1 + if num_invalid > 0: + logger.warning( + f"total {num_invalid} (audio, label) pairs with mismatched lengths" + ) + + +class HubertDataset(FairseqDataset): + def __init__( + self, + manifest_path: str, + sample_rate: float, + label_paths: List[str], + label_rates: Union[List[float], float], # -1 for sequence labels + pad_list: List[str], + eos_list: List[str], + label_processors: Optional[List[Any]] = None, + max_keep_sample_size: Optional[int] = None, + min_keep_sample_size: Optional[int] = None, + max_sample_size: Optional[int] = None, + shuffle: bool = True, + pad_audio: bool = False, + normalize: bool = False, + store_labels: bool = True, + random_crop: bool = False, + single_target: bool = False, + ): + self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio( + manifest_path, max_keep_sample_size, min_keep_sample_size + ) + self.sample_rate = sample_rate + self.shuffle = shuffle + self.random_crop = random_crop + + self.num_labels = len(label_paths) + self.pad_list = pad_list + self.eos_list = eos_list + self.label_processors = label_processors + self.single_target = single_target + self.label_rates = ( + [label_rates for _ in range(len(label_paths))] + if isinstance(label_rates, float) + else label_rates + ) + self.store_labels = store_labels + if store_labels: + self.label_list = [load_label(p, inds, tot) for p in label_paths] + else: + self.label_paths = label_paths + self.label_offsets_list = [ + load_label_offset(p, inds, tot) for p in label_paths + ] + assert label_processors is None or len(label_processors) == self.num_labels + for label_path, label_rate in zip(label_paths, self.label_rates): + verify_label_lengths( + self.sizes, sample_rate, label_path, label_rate, inds, tot + ) + + self.max_sample_size = ( + max_sample_size if max_sample_size is not None else sys.maxsize + ) + self.pad_audio = pad_audio + self.normalize = normalize + logger.info( + f"pad_audio={pad_audio}, random_crop={random_crop}, " + f"normalize={normalize}, max_sample_size={self.max_sample_size}" + ) + + def get_audio(self, index): + import soundfile as sf + + wav_path = os.path.join(self.audio_root, self.audio_names[index]) + wav, cur_sample_rate = sf.read(wav_path) + wav = torch.from_numpy(wav).float() + wav = self.postprocess(wav, cur_sample_rate) + return wav + + def get_label(self, index, label_idx): + if self.store_labels: + label = self.label_list[label_idx][index] + else: + with open(self.label_paths[label_idx]) as f: + offset_s, offset_e = self.label_offsets_list[label_idx][index] + f.seek(offset_s) + label = f.read(offset_e - offset_s) + + if self.label_processors is not None: + label = self.label_processors[label_idx](label) + return label + + def get_labels(self, index): + return [self.get_label(index, i) for i in range(self.num_labels)] + + def __getitem__(self, index): + wav = self.get_audio(index) + labels = self.get_labels(index) + return {"id": index, "source": wav, "label_list": labels} + + def __len__(self): + return len(self.sizes) + + def crop_to_max_size(self, wav, target_size): + size = len(wav) + diff = size - target_size + if diff <= 0: + return wav, 0 + + start, end = 0, target_size + if self.random_crop: + start = np.random.randint(0, diff + 1) + end = size - diff + start + return wav[start:end], start + + def collater(self, samples): + # target = max(sizes) -> random_crop not used + # target = max_sample_size -> random_crop used for long + samples = [s for s in samples if 
s["source"] is not None] + if len(samples) == 0: + return {} + + audios = [s["source"] for s in samples] + audio_sizes = [len(s) for s in audios] + if self.pad_audio: + audio_size = min(max(audio_sizes), self.max_sample_size) + else: + audio_size = min(min(audio_sizes), self.max_sample_size) + collated_audios, padding_mask, audio_starts = self.collater_audio( + audios, audio_size + ) + + targets_by_label = [ + [s["label_list"][i] for s in samples] for i in range(self.num_labels) + ] + targets_list, lengths_list, ntokens_list = self.collater_label( + targets_by_label, audio_size, audio_starts + ) + + net_input = {"source": collated_audios, "padding_mask": padding_mask} + batch = { + "id": torch.LongTensor([s["id"] for s in samples]), + "net_input": net_input, + } + + if self.single_target: + batch["target_lengths"] = lengths_list[0] + batch["ntokens"] = ntokens_list[0] + batch["target"] = targets_list[0] + else: + batch["target_lengths_list"] = lengths_list + batch["ntokens_list"] = ntokens_list + batch["target_list"] = targets_list + return batch + + def collater_audio(self, audios, audio_size): + collated_audios = audios[0].new_zeros(len(audios), audio_size) + padding_mask = ( + torch.BoolTensor(collated_audios.shape).fill_(False) + # if self.pad_audio else None + ) + audio_starts = [0 for _ in audios] + for i, audio in enumerate(audios): + diff = len(audio) - audio_size + if diff == 0: + collated_audios[i] = audio + elif diff < 0: + assert self.pad_audio + collated_audios[i] = torch.cat([audio, audio.new_full((-diff,), 0.0)]) + padding_mask[i, diff:] = True + else: + collated_audios[i], audio_starts[i] = self.crop_to_max_size( + audio, audio_size + ) + return collated_audios, padding_mask, audio_starts + + def collater_frm_label(self, targets, audio_size, audio_starts, label_rate, pad): + assert label_rate > 0 + s2f = label_rate / self.sample_rate + frm_starts = [int(round(s * s2f)) for s in audio_starts] + frm_size = int(round(audio_size * s2f)) + if not self.pad_audio: + rem_size = [len(t) - s for t, s in zip(targets, frm_starts)] + frm_size = min(frm_size, *rem_size) + targets = [t[s: s + frm_size] for t, s in zip(targets, frm_starts)] + logger.debug(f"audio_starts={audio_starts}") + logger.debug(f"frame_starts={frm_starts}") + logger.debug(f"frame_size={frm_size}") + + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_seq_label(self, targets, pad): + lengths = torch.LongTensor([len(t) for t in targets]) + ntokens = lengths.sum().item() + targets = data_utils.collate_tokens(targets, pad_idx=pad, left_pad=False) + return targets, lengths, ntokens + + def collater_label(self, targets_by_label, audio_size, audio_starts): + targets_list, lengths_list, ntokens_list = [], [], [] + itr = zip(targets_by_label, self.label_rates, self.pad_list) + for targets, label_rate, pad in itr: + if label_rate == -1.0: + targets, lengths, ntokens = self.collater_seq_label(targets, pad) + else: + targets, lengths, ntokens = self.collater_frm_label( + targets, audio_size, audio_starts, label_rate, pad + ) + targets_list.append(targets) + lengths_list.append(lengths) + ntokens_list.append(ntokens) + return targets_list, lengths_list, ntokens_list + + def num_tokens(self, index): + return self.size(index) + + def size(self, index): + if self.pad_audio: + return self.sizes[index] + return min(self.sizes[index], self.max_sample_size) + + def 
ordered_indices(self): + if self.shuffle: + order = [np.random.permutation(len(self))] + else: + order = [np.arange(len(self))] + + order.append(self.sizes) + return np.lexsort(order)[::-1] + + def postprocess(self, wav, cur_sample_rate): + if wav.dim() == 2: + wav = wav.mean(-1) + assert wav.dim() == 1, wav.dim() + + if cur_sample_rate != self.sample_rate: + raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}") + + if self.normalize: + with torch.no_grad(): + wav = F.layer_norm(wav, wav.shape) + return wav diff --git a/fengshen/data/megatron_dataloader/Makefile b/fengshen/data/megatron_dataloader/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8f9db7686696fbea6c94b998db4b40ef426c748d --- /dev/null +++ b/fengshen/data/megatron_dataloader/Makefile @@ -0,0 +1,9 @@ +CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color +CPPFLAGS += $(shell python3 -m pybind11 --includes) +LIBNAME = helpers +LIBEXT = $(shell python3-config --extension-suffix) + +default: $(LIBNAME)$(LIBEXT) + +%$(LIBEXT): %.cpp + $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ diff --git a/fengshen/data/megatron_dataloader/__init__.py b/fengshen/data/megatron_dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cd5f898c6bdf89c6cf0243af102d04f6efed86b8 --- /dev/null +++ b/fengshen/data/megatron_dataloader/__init__.py @@ -0,0 +1 @@ +from . import indexed_dataset diff --git a/fengshen/data/megatron_dataloader/bart_dataset.py b/fengshen/data/megatron_dataloader/bart_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..82a22aa21eba9ac4794305c72efe3c25e2bdefb7 --- /dev/null +++ b/fengshen/data/megatron_dataloader/bart_dataset.py @@ -0,0 +1,443 @@ +"""BART Style dataset. Modified from fairseq.""" + +import numpy as np +import torch +import math +import re + +from fengshen.data.megatron_dataloader.dataset_utils import ( + get_samples_mapping +) + + +class BartDataset(torch.utils.data.Dataset): + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, short_seq_prob, seed, tokenizer, zh_tokenizer): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 3, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
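+        # Cache the vocab size, id<->token maps, and special token ids (CLS/SEP/MASK/PAD) that the denoising transforms below rely on.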
+ self.vocab_size = tokenizer.vocab_size + inv_vocab = {v: k for k, v in tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + self.cls_id = tokenizer.cls_token_id + self.sep_id = tokenizer.sep_token_id + self.mask_id = tokenizer.mask_token_id + self.pad_id = tokenizer.pad_token_id + self.tokenizer = tokenizer + + seg_tokens = ['。', ';', ';', '!', '!', '?', '?'] + seg_token_ids = [] + for t in seg_tokens: + if t in tokenizer.vocab: + seg_token_ids.append(tokenizer.vocab[t]) + else: + print('seg_token "{}" not in vocab'.format(t)) + self.seg_token_ids = set(seg_token_ids) + + self.zh_tokenizer = zh_tokenizer + + # Denoising ratios + self.permute_sentence_ratio = 1.0 + self.mask_ratio = masked_lm_prob # 0.15 + self.random_ratio = 0.1 + self.insert_ratio = 0.0 + self.rotate_ratio = 0.0 + self.mask_whole_word = 1 + self.item_transform_func = None + + self.mask_span_distribution = None + if False: + _lambda = 3 # Poisson lambda + + lambda_to_the_k = 1 + e_to_the_minus_lambda = math.exp(-_lambda) + k_factorial = 1 + ps = [] + for k in range(0, 128): + ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial) + lambda_to_the_k *= _lambda + k_factorial *= k + 1 + if ps[-1] < 0.0000001: + break + ps = torch.FloatTensor(ps) + self.mask_span_distribution = torch.distributions.Categorical(ps) + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + start_idx, end_idx, seq_length = self.samples_mapping[idx] + sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 + np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) + return self.build_training_sample(sample, self.max_seq_length, np_rng) + + def build_training_sample(self, sample, max_seq_length, np_rng): + """Biuld training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + max_seq_length: Desired sequence length. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. 
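+        Returns:
+            A dict with padded input_ids, labels and attention_mask, produced after sentence permutation and whole-word masking (when the corresponding ratios are positive).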
+ """ + # permute sentences + full_stops = [] + tokens = [self.cls_id] + for sent in sample: + for t in sent: + token = self.vocab_id_to_token_dict[t] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + # 兼容erlangshen ##的方式做whole word mask + t = self.tokenizer.convert_tokens_to_ids(token[2:]) + tokens.append(t) + if t in self.seg_token_ids: + tokens.append(self.sep_id) + if tokens[-1] != self.sep_id: + tokens.append(self.sep_id) + + if len(tokens) > max_seq_length: + tokens = tokens[:max_seq_length] + tokens[-1] = self.sep_id + tokens = torch.LongTensor(tokens) + full_stops = (tokens == self.sep_id).long() + assert (max_seq_length - tokens.shape[0]) >= 0, (tokens.size(), tokens[-1], max_seq_length) + + source, target = tokens, tokens[1:].clone() + use_decoder = 1 + # if torch.rand(1).item() < 0.5: + # use_decoder = 0 + + if self.permute_sentence_ratio > 0.0 and use_decoder == 1: + source = self.permute_sentences(source, full_stops, self.permute_sentence_ratio) + + if self.mask_ratio > 0.0: + replace_length = 1 if use_decoder else -1 + mask_ratio = self.mask_ratio * 2 if use_decoder else self.mask_ratio + source = self.add_whole_word_mask(source, mask_ratio, replace_length) + + if self.insert_ratio > 0.0: + raise NotImplementedError + source = self.add_insertion_noise(source, self.insert_ratio) + + if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio: + raise NotImplementedError + source = self.add_rolling_noise(source) + + # there can additional changes to make: + if self.item_transform_func is not None: + source, target = self.item_transform_func(source, target) + + assert (source >= 0).all() + # assert (source[1:-1] >= 1).all() + assert (source <= self.vocab_size).all() + assert source[0] == self.cls_id + assert source[-1] == self.sep_id + + # tokenizer = get_tokenizer() + # print(' '.join(tokenizer.tokenizer.convert_ids_to_tokens(source))) + # print(tokenizer.detokenize(target)) + # print(tokenizer.detokenize(source)) + # print() + + prev_output_tokens = torch.zeros_like(target) + prev_output_tokens[0] = self.sep_id # match the preprocessing in fairseq + prev_output_tokens[1:] = target[:-1] + + # src_padding_length = max_seq_length - source.shape[0] + # tgt_padding_length = max_seq_length - target.shape[0] + # assert src_padding_length >= 0, (source.size(), source[-1], max_seq_length) + # assert tgt_padding_length >= 0, (target.size(), target[-1], max_seq_length) + source_ = torch.full((max_seq_length,), self.pad_id, dtype=torch.long) + source_[:source.shape[0]] = source + target_ = torch.full((max_seq_length,), -100, dtype=torch.long) + # decoder not need bos in the front + target_[:target.shape[0]] = target + prev_output_tokens_ = torch.full((max_seq_length,), self.pad_id, dtype=torch.long) + prev_output_tokens_[:prev_output_tokens.shape[0]] = prev_output_tokens + + return { + "input_ids": source_, + "labels": target_, + # "decoder_input_ids": prev_output_tokens_, + "attention_mask": (source_ != self.pad_id).long() + } + + def permute_sentences(self, source, full_stops, p=1.0): + # Tokens that are full stops, where the previous token is not + sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2 + result = source.clone() + + num_sentences = sentence_ends.size(0) + num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0) + substitutions = torch.randperm(num_sentences)[:num_to_permute] + ordering = torch.arange(0, num_sentences) + ordering[substitutions] = substitutions[torch.randperm(num_to_permute)] + + # Ignore at start + index = 1 + 
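+        # Copy each sentence span back into `result` in the permuted order, starting after the leading CLS token.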
for i in ordering: + sentence = source[(sentence_ends[i - 1] if i > 0 else 1): sentence_ends[i]] + result[index: index + sentence.size(0)] = sentence + index += sentence.size(0) + return result + + def word_starts_en(self, source): + if self.mask_whole_word is not None: + is_word_start = self.mask_whole_word.gather(0, source) + else: + is_word_start = torch.ones(source.size()) + is_word_start[0] = 0 + is_word_start[-1] = 0 + return is_word_start + + def word_starts(self, source): + if self.mask_whole_word is None: + is_word_start = torch.ones(source.size()) + is_word_start[0] = 0 + is_word_start[-1] = 0 + return is_word_start + raw_tokens = [self.vocab_id_to_token_dict[i] for i in source.tolist()] + words = [raw_tokens[0]] + \ + self.zh_tokenizer(''.join(raw_tokens[1:-1]), HMM=True) + [raw_tokens[-1]] + + def _is_chinese_char(c): + """Checks whether CP is the #codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if len(c) > 1: + return all([_is_chinese_char(c_i) for c_i in c]) + cp = ord(c) + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def align_linear(atokens, btokens): + a2c = [] + c2b = [] + a2b = [] + length = 0 + for tok in atokens: + a2c.append([length + i for i in range(len(tok))]) + length += len(tok) + for i, tok in enumerate(btokens): + c2b.extend([i for _ in range(len(tok))]) + + for i, amap in enumerate(a2c): + bmap = [c2b[ci] for ci in amap] + a2b.append(list(set(bmap))) + return a2b + + raw_to_word_align = align_linear(raw_tokens, words) + is_word_start = torch.zeros(source.size()) + word_starts = [] + skip_cur_word = True + for i in range(1, len(raw_to_word_align)): + if raw_to_word_align[i-1] == raw_to_word_align[i]: + # not a word start, as they align to the same word + if not skip_cur_word and not _is_chinese_char(raw_tokens[i]): + word_starts.pop(-1) + skip_cur_word = True + continue + else: + is_word_start[i] = 1 + if _is_chinese_char(raw_tokens[i]): + word_starts.append(i) + skip_cur_word = False + is_word_start[0] = 0 + is_word_start[-1] = 0 + word_starts = torch.tensor(word_starts).long().view(-1, 1) + return is_word_start, word_starts + + def add_whole_word_mask(self, source, p, replace_length=1): + is_word_start, word_starts = self.word_starts(source) + num_to_mask_word = int(math.ceil(word_starts.size(0) * p)) + num_to_mask_char = int(math.ceil(word_starts.size(0) * p * 0.1)) + num_to_mask = num_to_mask_word + num_to_mask_char + if num_to_mask > word_starts.size(0): + word_starts = is_word_start.nonzero(as_tuple=False) + num_inserts = 0 + if num_to_mask == 0: + return source + + if self.mask_span_distribution is not None: + lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,)) + + # Make sure we have enough to mask + cum_length = torch.cumsum(lengths, 0) + while 
cum_length[-1] < num_to_mask: + lengths = torch.cat( + [ + lengths, + self.mask_span_distribution.sample(sample_shape=(num_to_mask,)), + ], + dim=0, + ) + cum_length = torch.cumsum(lengths, 0) + + # Trim to masking budget + i = 0 + while cum_length[i] < num_to_mask: + i += 1 + lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1]) + num_to_mask = i + 1 + lengths = lengths[:num_to_mask] + + # Handle 0-length mask (inserts) separately + lengths = lengths[lengths > 0] + num_inserts = num_to_mask - lengths.size(0) + num_to_mask -= num_inserts + if num_to_mask == 0: + return self.add_insertion_noise(source, num_inserts / source.size(0)) + + assert (lengths > 0).all() + else: + lengths = torch.ones((num_to_mask,)).long() + assert is_word_start[-1] == 0 + indices = word_starts[ + torch.randperm(word_starts.size(0))[:num_to_mask] + ].squeeze(1) + mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio + source_length = source.size(0) + assert source_length - 1 not in indices + to_keep = torch.ones(source_length, dtype=torch.bool) + is_word_start[ + -1 + ] = 255 # acts as a long length, so spans don't go over the end of doc + if replace_length == 0: + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + # print(source.size(), word_starts.size(), indices.size(), mask_random.size()) + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + # sorted_indices = torch.sort(indices)[0] + # continue_mask_pos = ((sorted_indices + 1)[:-1] == sorted_indices[1:]) + # continue_mask_indices = sorted_indices[1:][continue_mask_pos] + # to_keep[continue_mask_indices] = 0 + + # for char indices, we already masked, the following loop handles word mask + indices = indices[:num_to_mask_word] + mask_random = mask_random[:num_to_mask_word] + if self.mask_span_distribution is not None: + assert len(lengths.size()) == 1 + assert lengths.size() == indices.size() + lengths -= 1 + while indices.size(0) > 0: + assert lengths.size() == indices.size() + lengths -= is_word_start[indices + 1].long() + uncompleted = lengths >= 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + lengths = lengths[uncompleted] + if replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + else: + # A bit faster when all lengths are 1 + while indices.size(0) > 0: + uncompleted = is_word_start[indices + 1] == 0 + indices = indices[uncompleted] + 1 + mask_random = mask_random[uncompleted] + if replace_length != -1: + # delete token + to_keep[indices] = 0 + else: + # keep index, but replace it with [MASK] + source[indices] = self.mask_id + source[indices[mask_random]] = torch.randint( + 1, self.vocab_size, size=(mask_random.sum(),) + ) + + assert source_length - 1 not in indices + + source = source[to_keep] + + if num_inserts > 0: + source = self.add_insertion_noise(source, num_inserts / source.size(0)) + + return source + + def add_permuted_noise(self, tokens, p): + num_words = len(tokens) + num_to_permute = math.ceil(((num_words * 2) * p) / 2.0) + substitutions = torch.randperm(num_words - 2)[:num_to_permute] + 1 + tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]] + return tokens + + def add_rolling_noise(self, tokens): + offset = np.random.randint(1, max(1, tokens.size(-1) - 1) + 1) 
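+        # Rotate the interior tokens by the random offset while keeping the first and last tokens in place.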
+ tokens = torch.cat( + (tokens[0:1], tokens[offset:-1], tokens[1:offset], tokens[-1:]), + dim=0, + ) + return tokens + + def add_insertion_noise(self, tokens, p): + if p == 0.0: + return tokens + + num_tokens = len(tokens) + n = int(math.ceil(num_tokens * p)) + + noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1 + noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool) + noise_mask[noise_indices] = 1 + result = torch.LongTensor(n + len(tokens)).fill_(-1) + + num_random = int(math.ceil(n * self.random_ratio)) + result[noise_indices[num_random:]] = self.mask_id + result[noise_indices[:num_random]] = torch.randint( + low=1, high=self.vocab_size, size=(num_random,) + ) + + result[~noise_mask] = tokens + + assert (result >= 0).all() + return result diff --git a/fengshen/data/megatron_dataloader/bert_dataset.py b/fengshen/data/megatron_dataloader/bert_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2c007f060fd07fc9c6302b7f88e191469d599222 --- /dev/null +++ b/fengshen/data/megatron_dataloader/bert_dataset.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT Style dataset.""" + + +import numpy as np +import torch + +from fengshen.data.megatron_dataloader.dataset_utils import ( + get_samples_mapping, + get_a_and_b_segments, + create_masked_lm_predictions, + create_tokens_and_tokentypes, +) + + +class BertDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, short_seq_prob, seed, binary_head, tokenizer, masking_style): + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.short_seq_prob = short_seq_prob + self.binary_head = binary_head + self.masking_style = masking_style + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + # account for added tokens + self.max_seq_length - 3, + short_seq_prob, + self.seed, + self.name, + self.binary_head) + inv_vocab = {v: k for k, v in tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + self.cls_id = tokenizer.cls_token_id + self.sep_id = tokenizer.sep_token_id + self.mask_id = tokenizer.mask_token_id + self.pad_id = tokenizer.pad_token_id + self.tokenizer = tokenizer + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + start_idx, end_idx, seq_length = self.samples_mapping[idx] + sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)] + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 + np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_id, self.sep_id, + self.mask_id, self.pad_id, + self.masked_lm_prob, np_rng, + self.binary_head, + tokenizer=self.tokenizer, + masking_style=self.masking_style) + + +def build_training_sample(sample, + target_seq_length, max_seq_length, + vocab_id_list, vocab_id_to_token_dict, + cls_id, sep_id, mask_id, pad_id, + masked_lm_prob, np_rng, binary_head, + tokenizer, + masking_style='bert'): + """Biuld training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_id: Start of example id. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + masked_lm_prob: Probability to mask tokens. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + """ + + if binary_head: + # We assume that we have at least two sentences in the sample + assert len(sample) > 1 + assert target_seq_length <= max_seq_length + + # Divide sample into two segments (A and B). + if binary_head: + tokens_a, tokens_b, is_next_random = get_a_and_b_segments(sample, + np_rng) + else: + tokens_a = [] + for j in range(len(sample)): + tokens_a.extend(sample[j]) + tokens_b = [] + is_next_random = False + + if len(tokens_a) >= max_seq_length-3: + tokens_a = tokens_a[:max_seq_length-3] + + # Truncate to `target_sequence_length`. + max_num_tokens = target_seq_length + '''' + truncated = truncate_segments(tokens_a, tokens_b, len(tokens_a), + len(tokens_b), max_num_tokens, np_rng) + ''' + + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, + cls_id, sep_id) + # Masking. + max_predictions_per_seq = masked_lm_prob * max_num_tokens + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + tokenizer=tokenizer, + masking_style=masking_style) + + # Padding. + tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np \ + = pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length) + + train_sample = { + 'input_ids': tokens_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + 'next_sentence_label': int(is_next_random), + 'attention_mask': padding_mask_np} + return train_sample + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. 
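+    # Right-pad tokens and token types with pad_id up to max_seq_length.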
+ filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np diff --git a/fengshen/data/megatron_dataloader/blendable_dataset.py b/fengshen/data/megatron_dataloader/blendable_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ee24d4056b86333a13d4926e79283a0bc96bbea3 --- /dev/null +++ b/fengshen/data/megatron_dataloader/blendable_dataset.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Blendable dataset.""" + +import time + +import numpy as np +import torch + +from fengshen.data.megatron_dataloader.utils import print_rank_0 + + +class BlendableDataset(torch.utils.data.Dataset): + + def __init__(self, datasets, weights): + + self.datasets = datasets + num_datasets = len(datasets) + assert num_datasets == len(weights) + + self.size = 0 + for dataset in self.datasets: + self.size += len(dataset) + + # Normalize weights. + weights = np.array(weights, dtype=np.float64) + sum_weights = np.sum(weights) + assert sum_weights > 0.0 + weights /= sum_weights + + # Build indecies. + start_time = time.time() + assert num_datasets < 255 + self.dataset_index = np.zeros(self.size, dtype=np.uint8) + self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) + + from fengshen.data.megatron_dataloader import helpers + helpers.build_blending_indices(self.dataset_index, + self.dataset_sample_index, + weights, num_datasets, self.size, + torch.distributed.get_rank() == 0) + print_rank_0('> elapsed time for building blendable dataset indices: ' + '{:.2f} (sec)'.format(time.time() - start_time)) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + dataset_idx = self.dataset_index[idx] + sample_idx = self.dataset_sample_index[idx] + return self.datasets[dataset_idx][sample_idx] diff --git a/fengshen/data/megatron_dataloader/dataset_utils.py b/fengshen/data/megatron_dataloader/dataset_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9b579751573ff8ddf94882c032d4ed6cc168ba07 --- /dev/null +++ b/fengshen/data/megatron_dataloader/dataset_utils.py @@ -0,0 +1,788 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, and NVIDIA. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Most of the code here has been copied from: +# https://github.com/google-research/albert/blob/master/create_pretraining_data.py +# with some modifications. + +import math +import time +import collections + +import numpy as np +import re + +from fengshen.data.megatron_dataloader.utils import ( + print_rank_0 +) +from fengshen.data.megatron_dataloader.blendable_dataset import BlendableDataset +from fengshen.data.megatron_dataloader.indexed_dataset import make_dataset as make_indexed_dataset + +DSET_TYPE_BERT = 'standard_bert' +DSET_TYPE_ICT = 'ict' +DSET_TYPE_T5 = 't5' +DSET_TYPE_BERT_CN_WWM = 'bert_cn_wwm' +DSET_TYPE_BART = 'bart' +DSET_TYPE_COCOLM = 'coco_lm' + +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, + DSET_TYPE_T5, DSET_TYPE_BERT_CN_WWM, + DSET_TYPE_BART, DSET_TYPE_COCOLM] + + +def get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples): + + # The data prefix should be in the format of: + # weight-1, data-prefix-1, weight-2, data-prefix-2, .. + assert len(data_prefix) % 2 == 0 + num_datasets = len(data_prefix) // 2 + weights = [0] * num_datasets + prefixes = [0] * num_datasets + for i in range(num_datasets): + weights[i] = float(data_prefix[2 * i]) + prefixes[i] = (data_prefix[2 * i + 1]).strip() + # Normalize weights + weight_sum = 0.0 + for weight in weights: + weight_sum += weight + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + + # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. + datasets_train_valid_test_num_samples = [] + for weight in weights: + datasets_train_valid_test_num_samples.append( + [int(math.ceil(val * weight * 1.005)) + for val in train_valid_test_num_samples]) + + return prefixes, weights, datasets_train_valid_test_num_samples + + +def compile_helper(): + """Compile helper function ar runtime. Make sure this + is invoked on a single process.""" + import os + import subprocess + path = os.path.abspath(os.path.dirname(__file__)) + ret = subprocess.run(['make', '-C', path]) + if ret.returncode != 0: + print("Making C++ dataset helpers module failed, exiting.") + import sys + sys.exit(1) + + +def get_a_and_b_segments(sample, np_rng): + """Divide sample into a and b segments.""" + + # Number of sentences in the sample. + n_sentences = len(sample) + # Make sure we always have two sentences. + assert n_sentences > 1, 'make sure each sample has at least two sentences.' + + # First part: + # `a_end` is how many sentences go into the `A`. + a_end = 1 + if n_sentences >= 3: + # Note that randin in numpy is exclusive. 
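+        # Pick a split point so that segment A receives between 1 and n_sentences - 1 sentences.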
+ a_end = np_rng.randint(1, n_sentences) + tokens_a = [] + for j in range(a_end): + tokens_a.extend(sample[j]) + + # Second part: + tokens_b = [] + for j in range(a_end, n_sentences): + tokens_b.extend(sample[j]) + + # Random next: + is_next_random = False + if np_rng.random() < 0.5: + is_next_random = True + tokens_a, tokens_b = tokens_b, tokens_a + + return tokens_a, tokens_b, is_next_random + + +def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng): + """Truncates a pair of sequences to a maximum sequence length.""" + # print(len_a, len_b, max_num_tokens) + assert len_a > 0 + if len_a + len_b <= max_num_tokens: + return False + while len_a + len_b > max_num_tokens: + if len_a > len_b: + len_a -= 1 + tokens = tokens_a + else: + len_b -= 1 + tokens = tokens_b + if np_rng.random() < 0.5: + del tokens[0] + else: + tokens.pop() + return True + + +def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id): + """Merge segments A and B, add [CLS] and [SEP] and build tokentypes.""" + + tokens = [] + tokentypes = [] + # [CLS]. + tokens.append(cls_id) + tokentypes.append(0) + # Segment A. + for token in tokens_a: + tokens.append(token) + tokentypes.append(0) + # [SEP]. + tokens.append(sep_id) + tokentypes.append(0) + # Segment B. + for token in tokens_b: + tokens.append(token) + tokentypes.append(1) + if tokens_b: + # [SEP]. + tokens.append(sep_id) + tokentypes.append(1) + + return tokens, tokentypes + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def is_start_piece(piece): + """Check if the current word piece is the starting piece (BERT).""" + # When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + return not piece.startswith("##") + + +def create_masked_lm_predictions(tokens, + vocab_id_list, vocab_id_to_token_dict, + masked_lm_prob, + cls_id, sep_id, mask_id, + max_predictions_per_seq, + np_rng, + tokenizer, + max_ngrams=3, + do_whole_word_mask=True, + favor_longer_ngram=False, + do_permutation=False, + geometric_dist=False, + masking_style="bert", + zh_tokenizer=None): + """Creates the predictions for the masked LM objective. + Note: Tokens here are vocab ids and not text tokens.""" + + cand_indexes = [] + # Note(mingdachen): We create a list for recording if the piece is + # the starting piece of current token, where 1 means true, so that + # on-the-fly whole word masking is possible. + token_boundary = [0] * len(tokens) + + # 如果没有指定中文分词器,那就直接按##算 + if zh_tokenizer is None: + for (i, token) in enumerate(tokens): + if token == cls_id or token == sep_id: + token_boundary[i] = 1 + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
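+            # Continuation pieces (tokens starting with ##) are appended to the previous candidate so a whole word can be masked as one unit.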
+ if (do_whole_word_mask and len(cand_indexes) >= 1 and + not is_start_piece(vocab_id_to_token_dict[token])): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + if is_start_piece(vocab_id_to_token_dict[token]): + token_boundary[i] = 1 + else: + # 如果指定了中文分词器,那就先用分词器分词,然后再进行判断 + # 获取去掉CLS SEP的原始文本 + raw_tokens = [] + for t in tokens: + if t != cls_id and t != sep_id: + raw_tokens.append(t) + raw_tokens = [vocab_id_to_token_dict[i] for i in raw_tokens] + # 分词然后获取每次字开头的最长词的长度 + word_list = set(zh_tokenizer(''.join(raw_tokens), HMM=True)) + word_length_dict = {} + for w in word_list: + if len(w) < 1: + continue + if w[0] not in word_length_dict: + word_length_dict[w[0]] = len(w) + elif word_length_dict[w[0]] < len(w): + word_length_dict[w[0]] = len(w) + i = 0 + # 从词表里面检索 + while i < len(tokens): + token_id = tokens[i] + token = vocab_id_to_token_dict[token_id] + if len(token) == 0 or token_id == cls_id or token_id == sep_id: + token_boundary[i] = 1 + i += 1 + continue + word_max_length = 1 + if token[0] in word_length_dict: + word_max_length = word_length_dict[token[0]] + j = 0 + word = '' + word_end = i+1 + # 兼容以前##的形式,如果后面的词是##开头的,那么直接把后面的拼到前面当作一个词 + old_style = False + while word_end < len(tokens) and vocab_id_to_token_dict[tokens[word_end]].startswith('##'): + old_style = True + word_end += 1 + if not old_style: + while j < word_max_length and i+j < len(tokens): + cur_token = tokens[i+j] + word += vocab_id_to_token_dict[cur_token] + j += 1 + if word in word_list: + word_end = i+j + cand_indexes.append([p for p in range(i, word_end)]) + token_boundary[i] = 1 + i = word_end + + output_tokens = list(tokens) + # add by ganruyi + if masking_style == 'bert-cn-wwm': + # if non chinese is False, that means it is chinese + # then try to remove "##" which is added previously + new_token_ids = [] + for token_id in output_tokens: + token = tokenizer.convert_ids_to_tokens([token_id])[0] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + token = token[2:] + new_token_id = tokenizer.convert_tokens_to_ids([token])[ + 0] + new_token_ids.append(new_token_id) + output_tokens = new_token_ids + + masked_lm_positions = [] + masked_lm_labels = [] + + if masked_lm_prob == 0: + return (output_tokens, masked_lm_positions, + masked_lm_labels, token_boundary) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) + if not geometric_dist: + # Note(mingdachen): + # By default, we set the probilities to favor shorter ngram sequences. + pvals = 1. / np.arange(1, max_ngrams + 1) + pvals /= pvals.sum(keepdims=True) + if favor_longer_ngram: + pvals = pvals[::-1] + # 获取一个ngram的idx,对于每个word,记录他的ngram的word + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) + + (masked_lms, masked_spans) = ([], []) + covered_indexes = set() + for cand_index_set in ngram_indexes: + if len(masked_lms) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. 
+ for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes: + continue + + if not geometric_dist: + n = np_rng.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + else: + # Sampling "n" from the geometric distribution and clipping it to + # the max_ngrams. Using p=0.2 default from the SpanBERT paper + # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) + n = min(np_rng.geometric(0.2), max_ngrams) + + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # Note(mingdachen): + # Repeatedly looking for a candidate that does not exceed the + # maximum number of predictions by trying shorter ngrams. + while len(masked_lms) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. + if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_token = None + if masking_style == "bert": + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))] + elif masking_style == 'bert-cn-wwm': + # 80% of the time, replace with [MASK] + if np_rng.random() < 0.8: + masked_token = mask_id + else: + # 10% of the time, keep original + if np_rng.random() < 0.5: + # 如果是中文全词mask,去掉tokens里的## + token_id = tokens[index] + token = tokenizer.convert_ids_to_tokens([token_id])[ + 0] + if len(re.findall('##[\u4E00-\u9FA5]', token)) > 0: + token = token[2:] + new_token_id = tokenizer.convert_tokens_to_ids([token])[ + 0] + masked_token = new_token_id + # 10% of the time, replace with random word + else: + masked_token = vocab_id_list[np_rng.randint( + 0, len(vocab_id_list))] + elif masking_style == "t5": + masked_token = mask_id + else: + raise ValueError("invalid value of masking style") + + output_tokens[index] = masked_token + masked_lms.append(MaskedLmInstance( + index=index, label=tokens[index])) + + masked_spans.append(MaskedLmInstance( + index=index_set, + label=[tokens[index] for index in index_set])) + + assert len(masked_lms) <= num_to_predict + np_rng.shuffle(ngram_indexes) + + select_indexes = set() + if do_permutation: + for cand_index_set in ngram_indexes: + if len(select_indexes) >= num_to_predict: + break + if not cand_index_set: + continue + # Note(mingdachen): + # Skip current piece if they are covered in lm masking or previous ngrams. + for index_set in cand_index_set[0]: + for index in index_set: + if index in covered_indexes or index in select_indexes: + continue + + n = np.random.choice(ngrams[:len(cand_index_set)], + p=pvals[:len(cand_index_set)] / + pvals[:len(cand_index_set)].sum(keepdims=True)) + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + + while len(select_indexes) + len(index_set) > num_to_predict: + if n == 0: + break + index_set = sum(cand_index_set[n - 1], []) + n -= 1 + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(select_indexes) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes or index in select_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + select_indexes.add(index) + assert len(select_indexes) <= num_to_predict + + select_indexes = sorted(select_indexes) + permute_indexes = list(select_indexes) + np_rng.shuffle(permute_indexes) + orig_token = list(output_tokens) + + for src_i, tgt_i in zip(select_indexes, permute_indexes): + output_tokens[src_i] = orig_token[tgt_i] + masked_lms.append(MaskedLmInstance( + index=src_i, label=orig_token[src_i])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + # Sort the spans by the index of the first span + masked_spans = sorted(masked_spans, key=lambda x: x.index[0]) + + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary, masked_spans) + + +def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, + masked_labels, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [pad_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-1] * max_seq_length + loss_mask = [0] * max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + loss_mask[masked_positions[i]] = 1 + labels_np = np.array(labels, dtype=np.int64) + loss_mask_np = np.array(loss_mask, dtype=np.int64) + + return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + tokenizer, + skip_warmup, binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert', + zh_tokenizer=None, + span=None): + + if len(data_prefix) == 1: + return _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, masked_lm_prob, + short_seq_prob, seed, + skip_warmup, + binary_head, + max_seq_length_dec, + tokenizer, + dataset_type=dataset_type, + zh_tokenizer=zh_tokenizer, + span=span) + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
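+    # Build one train/valid/test triple per data prefix; the triples are blended below according to the normalized weights.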
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, max_seq_length_dec, + tokenizer, dataset_type=dataset_type, zh_tokenizer=zh_tokenizer) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head, + max_seq_length_dec, + tokenizer, + dataset_type='standard_bert', + zh_tokenizer=None, + span=None): + + if dataset_type not in DSET_TYPES: + raise ValueError("Invalid dataset_type: ", dataset_type) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + # Get start and end indices of train/valid/train into doc-idx + # Note that doc-idx is desinged to be num-docs + 1 so we can + # easily iterate over it. + total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1 + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + start_index = indexed_dataset.doc_idx[splits[index]] + end_index = indexed_dataset.doc_idx[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + from fengshen.data.megatron_dataloader.bert_dataset import BertDataset + from fengshen.data.megatron_dataloader.bart_dataset import BartDataset + from fengshen.data.megatron_dataloader.cocolm_dataset import COCOLMDataset + dataset = None + if splits[index + 1] > splits[index]: + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_doc_idx() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) + # Build the dataset accordingly. 
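+            # Shared constructor arguments; the concrete dataset class is selected by dataset_type below.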
+ kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=max_seq_length, + seed=seed, + ) + + if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_BERT_CN_WWM: + dataset = BertDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + binary_head=binary_head, + # 增加参数区分bert和bert-cn-wwm + tokenizer=tokenizer, + masking_style='bert' if dataset_type == DSET_TYPE_BERT else 'bert-cn-wwm', + **kwargs + ) + elif dataset_type == DSET_TYPE_BART: + dataset = BartDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + tokenizer=tokenizer, + zh_tokenizer=zh_tokenizer, + **kwargs + ) + elif dataset_type == DSET_TYPE_COCOLM: + dataset = COCOLMDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + short_seq_prob=short_seq_prob, + tokenizer=tokenizer, + masking_style='bert', + span=span, + **kwargs + ) + else: + raise NotImplementedError( + "Dataset type not fully implemented.") + + # Set the original pointer so dataset remains the main dataset. + indexed_dataset.set_doc_idx(doc_idx_ptr) + # Checks. + assert indexed_dataset.doc_idx[0] == 0 + assert indexed_dataset.doc_idx.shape[0] == \ + (total_num_of_documents + 1) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + + print_rank_0(' > building dataset index ...') + + start_time = time.time() + indexed_dataset = make_indexed_dataset(data_prefix, + data_impl, + skip_warmup) + assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + + print_rank_0(' > indexed dataset stats:') + print_rank_0(' number of documents: {}'.format( + indexed_dataset.doc_idx.shape[0] - 1)) + print_rank_0(' number of sentences: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +def get_train_valid_test_split_(splits_string, size): + """ Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(',') != -1: + splits = [float(s) for s in splits_string.split(',')] + elif splits_string.find('/') != -1: + splits = [float(s) for s in splits_string.split('/')] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.) 
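+    # Keep exactly three ratios, normalize them, and convert them into cumulative document-index boundaries that end at `size`.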
+ splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + + +def get_samples_mapping(indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + max_seq_length, + short_seq_prob, + seed, + name, + binary_head): + """Get a list that maps a sample index to a starting + sentence index, end sentence index, and length""" + + if not num_epochs: + if not max_num_samples: + raise ValueError("Need to specify either max_num_samples " + "or num_epochs") + num_epochs = np.iinfo(np.int32).max - 1 + if not max_num_samples: + max_num_samples = np.iinfo(np.int64).max - 1 + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + if num_epochs != (np.iinfo(np.int32).max - 1): + indexmap_filename += '_{}ep'.format(num_epochs) + if max_num_samples != (np.iinfo(np.int64).max - 1): + indexmap_filename += '_{}mns'.format(max_num_samples) + indexmap_filename += '_{}msl'.format(max_seq_length) + indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob) + indexmap_filename += '_{}s'.format(seed) + indexmap_filename += '.npy' + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + # ganruyi comment + # counts = torch.cuda.LongTensor([1]) + # torch.distributed.all_reduce( + # counts, group=mpu.get_data_parallel_group()) + # torch.distributed.all_reduce( + # counts, group=mpu.get_pipeline_model_parallel_group()) + # assert counts[0].item() == ( + # torch.distributed.get_world_size() // + # torch.distributed.get_world_size( + # group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load( + indexmap_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + samples_mapping.shape[0])) + + return samples_mapping diff --git a/fengshen/data/megatron_dataloader/helpers.cpp b/fengshen/data/megatron_dataloader/helpers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..31277dd1ce3a449bf962ba5a4d6343e7a9c0b5f9 --- /dev/null +++ b/fengshen/data/megatron_dataloader/helpers.cpp @@ -0,0 +1,794 @@ +/* + coding=utf-8 + Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +/* Helper methods for fast index mapping builds */ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +const int32_t LONG_SENTENCE_LEN = 512; + +void build_blending_indices(py::array_t &dataset_index, + py::array_t &dataset_sample_index, + const py::array_t &weights, + const int32_t num_datasets, + const int64_t size, const bool verbose) +{ + /* Given multiple datasets and a weighting array, build samples + such that it follows those wieghts.*/ + + if (verbose) + { + std::cout << "> building indices for blendable datasets ..." << std::endl; + } + + // Get the pointer access without the checks. + auto dataset_index_ptr = dataset_index.mutable_unchecked<1>(); + auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>(); + auto weights_ptr = weights.unchecked<1>(); + + // Initialize buffer for number of samples used for each dataset. + int64_t current_samples[num_datasets]; + for (int64_t i = 0; i < num_datasets; ++i) + { + current_samples[i] = 0; + } + + // For each sample: + for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) + { + + // Determine where the max error in sampling is happening. + auto sample_idx_double = std::max(static_cast(sample_idx), 1.0); + int64_t max_error_index = 0; + double max_error = weights_ptr[0] * sample_idx_double - + static_cast(current_samples[0]); + for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) + { + double error = weights_ptr[dataset_idx] * sample_idx_double - + static_cast(current_samples[dataset_idx]); + if (error > max_error) + { + max_error = error; + max_error_index = dataset_idx; + } + } + + // Populate the indices. + dataset_index_ptr[sample_idx] = static_cast(max_error_index); + dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index]; + + // Update the total samples. + current_samples[max_error_index] += 1; + } + + // print info + if (verbose) + { + std::cout << " > sample ratios:" << std::endl; + for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) + { + auto ratio = static_cast(current_samples[dataset_idx]) / + static_cast(size); + std::cout << " dataset " << dataset_idx << ", input: " << weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl; + } + } +} + +py::array build_sample_idx(const py::array_t &sizes_, + const py::array_t &doc_idx_, + const int32_t seq_length, + const int32_t num_epochs, + const int64_t tokens_per_epoch) +{ + /* Sample index (sample_idx) is used for gpt2 like dataset for which + the documents are flattened and the samples are built based on this + 1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2] + where [..., 0] contains the index into `doc_idx` and [..., 1] is the + starting offset in that document.*/ + + // Consistency checks. + assert(seq_length > 1); + assert(num_epochs > 0); + assert(tokens_per_epoch > 1); + + // Remove bound checks. + auto sizes = sizes_.unchecked<1>(); + auto doc_idx = doc_idx_.unchecked<1>(); + + // Mapping and it's length (1D). 
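+    // sample_idx is a flattened [num_samples + 1, 2] array: column 0 indexes into doc_idx, column 1 is the token offset inside that document.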
+ int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length; + int32_t *sample_idx = new int32_t[2 * (num_samples + 1)]; + + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << doc_idx_.shape(0) / num_epochs << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " sequence length: " << seq_length << endl + << std::flush; + cout << " total number of samples: " << num_samples << endl + << std::flush; + + // Index into sample_idx. + int64_t sample_index = 0; + // Index into doc_idx. + int64_t doc_idx_index = 0; + // Begining offset for each document. + int32_t doc_offset = 0; + // Start with first document and no offset. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + + while (sample_index <= num_samples) + { + // Start with a fresh sequence. + int32_t remaining_seq_length = seq_length + 1; + while (remaining_seq_length != 0) + { + // Get the document length. + auto doc_id = doc_idx[doc_idx_index]; + auto doc_length = sizes[doc_id] - doc_offset; + // And add it to the current sequence. + remaining_seq_length -= doc_length; + // If we have more than a full sequence, adjust offset and set + // remaining length to zero so we return from the while loop. + // Note that -1 here is for the same reason we have -1 in + // `_num_epochs` calculations. + if (remaining_seq_length <= 0) + { + doc_offset += (remaining_seq_length + doc_length - 1); + remaining_seq_length = 0; + } + else + { + // Otherwise, start from the begining of the next document. + ++doc_idx_index; + doc_offset = 0; + } + } + // Record the sequence. + sample_idx[2 * sample_index] = doc_idx_index; + sample_idx[2 * sample_index + 1] = doc_offset; + ++sample_index; + } + + // Method to deallocate memory. + py::capsule free_when_done(sample_idx, [](void *mem_) + { + int32_t *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(int32_t); + return py::array(std::vector{num_samples + 1, 2}, // shape + {2 * byte_size, byte_size}, // C-style contiguous strides + sample_idx, // the data pointer + free_when_done); // numpy array references +} + +inline int32_t get_target_sample_len(const int32_t short_seq_ratio, + const int32_t max_length, + std::mt19937 &rand32_gen) +{ + /* Training sample length. */ + if (short_seq_ratio == 0) + { + return max_length; + } + const auto random_number = rand32_gen(); + if ((random_number % short_seq_ratio) == 0) + { + return 2 + random_number % (max_length - 1); + } + return max_length; +} + +template +py::array build_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const double short_seq_prob, + const int32_t seed, + const bool verbose, + const int32_t min_num_sent) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(short_seq_prob >= 0.0); + assert(short_seq_prob <= 1.0); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + + // For efficiency, convert probability to ratio. Note: rand() generates int. 
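The offset bookkeeping in build_sample_idx above is the part that is easiest to misread; here is a line-for-line Python sketch of the same walk over the flattened documents (illustrative only, the dataloader calls the compiled helper):

import numpy as np

def build_sample_idx_sketch(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
    # sample_idx[i] = (index into doc_idx, token offset) where sample i starts
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros((num_samples + 1, 2), dtype=np.int32)
    doc_pos, offset = 0, 0
    sample_idx[0] = (doc_pos, offset)
    for s in range(1, num_samples + 1):
        remaining = seq_length + 1  # each sample consumes seq_length + 1 tokens, as in the C++ loop
        while remaining > 0:
            doc_len = sizes[doc_idx[doc_pos]] - offset
            remaining -= doc_len
            if remaining <= 0:
                # the sample ends inside this document; advance the offset
                offset += remaining + doc_len - 1
                remaining = 0
            else:
                # the sample spills over into the next document
                doc_pos += 1
                offset = 0
        sample_idx[s] = (doc_pos, offset)
    return sample_idx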
+ int32_t short_seq_ratio = 0; + if (short_seq_prob > 0) + { + short_seq_ratio = static_cast(round(1.0 / short_seq_prob)); + } + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " short sequence probability: " << short_seq_prob << endl + << std::flush; + cout << " short sequence ration (1/prob): " << short_seq_ratio << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and it's length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the seed so both iterations produce the same results. + std::mt19937 rand32_gen(seed); + + // Set the flag on second iteration. + second = (iteration == 1); + + // Counters: + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + + // Current map index. + uint64_t map_index = 0; + + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + + // Detect documents with long sentences. + bool contains_long_sentence = false; + if (num_remain_sent > 1) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + + // If we have more than two sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + auto target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. 
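A note on get_target_sample_len above: the probability of drawing a shortened sample is approximately short_seq_prob, and the same 32-bit draw decides both whether to shorten and by how much. A hedged Python restatement of that helper:

import random

def target_sample_len_sketch(short_seq_prob, max_length, rng=random):
    # with probability ~short_seq_prob return a length in [2, max_length],
    # otherwise return max_length (mirrors the C++ helper above)
    if short_seq_prob <= 0:
        return max_length
    short_seq_ratio = int(round(1.0 / short_seq_prob))
    draw = rng.getrandbits(32)
    if draw % short_seq_ratio == 0:
        return 2 + draw % (max_length - 1)
    return max_length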
+ seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and if not only one sentence is left in the document. + // and if we have at least two sentneces. + // and if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent > 1) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Check for overflow. + if ((3 * map_index + 2) > + std::numeric_limits::max()) + { + cout << "number of samples exceeded maximum " + << "allowed by type int64: " + << std::numeric_limits::max() + << endl; + throw std::overflow_error("Number of samples"); + } + + // Populate the map. + if (second) + { + const auto map_index_0 = 3 * map_index; + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(target_seq_len); + } + + // Update indices / counters. + ++map_index; + prev_start_index = sent_index + 1; + target_seq_len = get_target_sample_len(short_seq_ratio, + max_seq_length, + rand32_gen); + seq_len = 0; + num_sent = 0; + } + + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[3 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 3 * i; + const auto j0 = 3 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. + const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 3}, // shape + {3 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const double short_seq_prob, + const int seed, + const bool verbose, + const int32_t min_num_sent) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." 
<< endl + << std::flush; + } + return build_mapping_impl(docs_, sizes_, num_epochs, + max_num_samples, max_seq_length, + short_seq_prob, seed, verbose, + min_num_sent); + } +} + +template +py::array build_blocks_mapping_impl(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int32_t num_epochs, + const uint64_t max_num_samples, + const int32_t max_seq_length, + const int32_t seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + /* Build a mapping of (start-index, end-index, sequence-length) where + start and end index are the indices of the sentences in the sample + and sequence-length is the target sequence length. + */ + + // Consistency checks. + assert(num_epochs > 0); + assert(max_seq_length > 1); + assert(seed > 0); + + // Remove bound checks. + auto docs = docs_.unchecked<1>(); + auto sizes = sizes_.unchecked<1>(); + auto titles_sizes = titles_sizes_.unchecked<1>(); + + if (verbose) + { + const auto sent_start_index = docs[0]; + const auto sent_end_index = docs[docs_.shape(0) - 1]; + const auto num_sentences = sent_end_index - sent_start_index; + cout << " using:" << endl + << std::flush; + cout << " number of documents: " << docs_.shape(0) - 1 << endl + << std::flush; + cout << " sentences range: [" << sent_start_index << ", " << sent_end_index << ")" << endl + << std::flush; + cout << " total number of sentences: " << num_sentences << endl + << std::flush; + cout << " number of epochs: " << num_epochs << endl + << std::flush; + cout << " maximum number of samples: " << max_num_samples << endl + << std::flush; + cout << " maximum sequence length: " << max_seq_length << endl + << std::flush; + cout << " seed: " << seed << endl + << std::flush; + } + + // Mapping and its length (1D). + int64_t num_samples = -1; + DocIdx *maps = NULL; + + // Acceptable number of sentences per block. + int min_num_sent = 2; + if (use_one_sent_blocks) + { + min_num_sent = 1; + } + + // Perform two iterations, in the first iteration get the size + // and allocate memory and in the second iteration populate the map. + bool second = false; + for (int32_t iteration = 0; iteration < 2; ++iteration) + { + + // Set the flag on second iteration. + second = (iteration == 1); + + // Current map index. + uint64_t map_index = 0; + + uint64_t empty_docs = 0; + uint64_t one_sent_docs = 0; + uint64_t long_sent_docs = 0; + // For each epoch: + for (int32_t epoch = 0; epoch < num_epochs; ++epoch) + { + // assign every block a unique id + int32_t block_id = 0; + + if (map_index >= max_num_samples) + { + if (verbose && (!second)) + { + cout << " reached " << max_num_samples << " samples after " + << epoch << " epochs ..." << endl + << std::flush; + } + break; + } + // For each document: + for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) + { + + // Document sentences are in [sent_index_first, sent_index_last) + const auto sent_index_first = docs[doc]; + const auto sent_index_last = docs[doc + 1]; + const auto target_seq_len = max_seq_length - titles_sizes[doc]; + + // At the begining of the document previous index is the + // start index. + auto prev_start_index = sent_index_first; + + // Remaining documents. + auto num_remain_sent = sent_index_last - sent_index_first; + + // Some bookkeeping + if ((epoch == 0) && (!second)) + { + if (num_remain_sent == 0) + { + ++empty_docs; + } + if (num_remain_sent == 1) + { + ++one_sent_docs; + } + } + // Detect documents with long sentences. 
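Both mapping builders share the same two-pass structure: the first iteration only counts samples so the output buffer can be sized exactly, the second fills it in, and in build_mapping_impl the 32-bit RNG is re-seeded at the top of each iteration so both passes see identical random draws. A generic sketch of the pattern, assuming the row generator is deterministic for a fixed seed:

import numpy as np

def two_pass_build(make_rows, row_width, seed):
    # pass 1: count rows; pass 2: materialize them into an exactly-sized buffer
    count = sum(1 for _ in make_rows(seed))
    out = np.empty((count, row_width), dtype=np.int64)
    for i, row in enumerate(make_rows(seed)):
        out[i] = row
    return out

This trades a second pass over the data for never having to grow or over-allocate the mapping array.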
+ bool contains_long_sentence = false; + if (num_remain_sent >= min_num_sent) + { + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + if (sizes[sent_index] > LONG_SENTENCE_LEN) + { + if ((epoch == 0) && (!second)) + { + ++long_sent_docs; + } + contains_long_sentence = true; + break; + } + } + } + // If we have enough sentences and no long sentences. + if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) + { + + // Set values. + auto seq_len = int32_t{0}; + auto num_sent = int32_t{0}; + + // Loop through sentences. + for (auto sent_index = sent_index_first; + sent_index < sent_index_last; ++sent_index) + { + + // Add the size and number of sentences. + seq_len += sizes[sent_index]; + ++num_sent; + --num_remain_sent; + + // If we have reached the target length. + // and there are an acceptable number of sentences left + // and if we have at least the minimum number of sentences. + // or if we have reached end of the document. + if (((seq_len >= target_seq_len) && + (num_remain_sent >= min_num_sent) && + (num_sent >= min_num_sent)) || + (num_remain_sent == 0)) + { + + // Populate the map. + if (second) + { + const auto map_index_0 = 4 * map_index; + // Each sample has 4 items: the starting sentence index, ending sentence index, + // the index of the document from which the block comes (used for fetching titles) + // and the unique id of the block (used for creating block indexes) + + maps[map_index_0] = static_cast(prev_start_index); + maps[map_index_0 + 1] = static_cast(sent_index + 1); + maps[map_index_0 + 2] = static_cast(doc); + maps[map_index_0 + 3] = static_cast(block_id); + } + + // Update indices / counters. + ++map_index; + ++block_id; + prev_start_index = sent_index + 1; + seq_len = 0; + num_sent = 0; + } + } // for (auto sent_index=sent_index_first; ... + } // if (num_remain_sent > 1) { + } // for (int doc=0; doc < num_docs; ++doc) { + } // for (int epoch=0; epoch < num_epochs; ++epoch) { + + if (!second) + { + if (verbose) + { + cout << " number of empty documents: " << empty_docs << endl + << std::flush; + cout << " number of documents with one sentence: " << one_sent_docs << endl + << std::flush; + cout << " number of documents with long sentences: " << long_sent_docs << endl + << std::flush; + cout << " will create mapping for " << map_index << " samples" << endl + << std::flush; + } + assert(maps == NULL); + assert(num_samples < 0); + maps = new DocIdx[4 * map_index]; + num_samples = static_cast(map_index); + } + + } // for (int iteration=0; iteration < 2; ++iteration) { + + // Shuffle. + // We need a 64 bit random number generator as we might have more + // than 2 billion samples. + std::mt19937_64 rand64_gen(seed + 1); + for (auto i = (num_samples - 1); i > 0; --i) + { + const auto j = static_cast(rand64_gen() % (i + 1)); + const auto i0 = 4 * i; + const auto j0 = 4 * j; + // Swap values. + swap(maps[i0], maps[j0]); + swap(maps[i0 + 1], maps[j0 + 1]); + swap(maps[i0 + 2], maps[j0 + 2]); + swap(maps[i0 + 3], maps[j0 + 3]); + } + + // Method to deallocate memory. + py::capsule free_when_done(maps, [](void *mem_) + { + DocIdx *mem = reinterpret_cast(mem_); + delete[] mem; + }); + + // Return the numpy array. 
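After the map is populated, both builders shuffle whole rows in place with a 64-bit Mersenne Twister, since the sample count can exceed 2 billion. In NumPy terms the row shuffle is a plain Fisher-Yates pass; note that this sketch uses NumPy's Generator, so for a given seed it does not reproduce the C++ permutation:

import numpy as np

def shuffle_rows_sketch(maps, seed):
    # maps: 2-D array with one sample per row; shuffled in place, rows kept intact
    rng = np.random.default_rng(seed + 1)
    for i in range(len(maps) - 1, 0, -1):
        j = int(rng.integers(0, i + 1))
        maps[[i, j]] = maps[[j, i]]
    return maps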
+ const auto byte_size = sizeof(DocIdx); + return py::array(std::vector{num_samples, 4}, // shape + {4 * byte_size, byte_size}, // C-style contiguous strides + maps, // the data pointer + free_when_done); // numpy array references +} + +py::array build_blocks_mapping(const py::array_t &docs_, + const py::array_t &sizes_, + const py::array_t &titles_sizes_, + const int num_epochs, + const uint64_t max_num_samples, + const int max_seq_length, + const int seed, + const bool verbose, + const bool use_one_sent_blocks) +{ + + if (sizes_.size() > std::numeric_limits::max()) + { + if (verbose) + { + cout << " using uint64 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } + else + { + if (verbose) + { + cout << " using uint32 for data mapping..." << endl + << std::flush; + } + return build_blocks_mapping_impl(docs_, sizes_, titles_sizes_, + num_epochs, max_num_samples, max_seq_length, seed, verbose, use_one_sent_blocks); + } +} + +PYBIND11_MODULE(helpers, m) +{ + m.def("build_mapping", &build_mapping); + m.def("build_blocks_mapping", &build_blocks_mapping); + m.def("build_sample_idx", &build_sample_idx); + m.def("build_blending_indices", &build_blending_indices); +} diff --git a/fengshen/data/megatron_dataloader/indexed_dataset.py b/fengshen/data/megatron_dataloader/indexed_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9eba91d303ab11884d993b707ca1d166f540588b --- /dev/null +++ b/fengshen/data/megatron_dataloader/indexed_dataset.py @@ -0,0 +1,585 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
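Once helpers.cpp has been compiled into an importable extension (the build step itself is not part of this diff), the four bound functions are called with positional arguments, since the bindings do not declare py::arg names. A hedged usage sketch for build_sample_idx with made-up toy inputs; the import path and array dtypes are assumptions:

import numpy as np
# assumption: the extension has been built and is importable from this package
from fengshen.data.megatron_dataloader import helpers

sizes = np.array([5, 7, 3, 9], dtype=np.int32)    # token count of each document
doc_idx = np.array([0, 1, 2, 3], dtype=np.int32)  # document order for one epoch
sample_idx = helpers.build_sample_idx(
    sizes, doc_idx,
    8,                    # seq_length
    1,                    # num_epochs
    int(sizes.sum()))     # tokens_per_epoch
# sample_idx has shape (num_samples + 1, 2):
# column 0 is the index into doc_idx, column 1 the offset where a sample starts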
+ +from functools import lru_cache +import os +import shutil +import struct +from itertools import accumulate + +import numpy as np +import torch +from fengshen.data.megatron_dataloader.utils import print_rank_0 + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ['lazy', 'cached', 'mmap'] + + +def infer_dataset_impl(path): + if IndexedDataset.exists(path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + if magic == IndexedDataset._HDR_MAGIC: + return 'cached' + elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: + return 'mmap' + else: + return None + else: + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and " + ".bin can be appended to get full filenames.") + return None + + +def make_builder(out_file, impl, vocab_size=None): + if impl == 'mmap': + return MMapIndexedDatasetBuilder(out_file, + dtype=__best_fitting_dtype(vocab_size)) + else: + return IndexedDatasetBuilder(out_file) + + +def make_dataset(path, impl, skip_warmup=False): + if not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx " + "and .bin can be appended to get full filenames.") + return None + if impl == 'infer': + impl = infer_dataset_impl(path) + if impl == 'lazy' and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == 'cached' and IndexedDataset.exists(path): + return IndexedCachedDataset(path) + elif impl == 'mmap' and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def dataset_exists(path, impl): + if impl == 'mmap': + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float, + 7: np.double, + 8: np.uint16 +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + '.idx' + + +def data_file_path(prefix_path): + return prefix_path + '.bin' + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(torch.utils.data.Dataset): + """Loader for IndexedDataset""" + _HDR_MAGIC = b'TNTIDX\x00\x00' + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), 'rb') as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + 'Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.' 
+ ) + version = f.read(8) + assert struct.unpack('= self._len: + raise IndexError('index out of range') + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[ + self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError( + "Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return ( + os.path.exists(index_file_path(path)) and os.path.exists( + data_file_path(path)) + ) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + +class IndexedCachedDataset(IndexedDataset): + + def __init__(self, path): + super().__init__(path) + self.cache = None + self.cache_index = {} + + @property + def supports_prefetch(self): + return True + + def prefetch(self, indices): + if all(i in self.cache_index for i in indices): + return + if not self.data_file: + self.read_data(self.path) + indices = sorted(set(indices)) + total_size = 0 + for i in indices: + total_size += self.data_offsets[i + 1] - self.data_offsets[i] + self.cache = np.empty(total_size, dtype=self.dtype) + ptx = 0 + self.cache_index.clear() + for i in indices: + self.cache_index[i] = ptx + size = self.data_offsets[i + 1] - self.data_offsets[i] + a = self.cache[ptx: ptx + size] + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + ptx += size + if self.data_file: + # close and delete data file after prefetch so we can pickle + self.data_file.close() + self.data_file = None + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[ + self.dim_offsets[i]:self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + ptx = self.cache_index[i] + np.copyto(a, self.cache[ptx: ptx + a.size]) + return a + elif isinstance(idx, slice): + # Hack just to make this work, can optimizer later if necessary + sents = [] + for i in range(*idx.indices(len(self))): + sents.append(self[i]) + return sents + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.int32: 4, + np.int64: 8, + np.float: 4, + np.double: 8 + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, 'wb') + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype)) + self.data_offsets.append( + self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.size(): + 
self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + begin = self.data_offsets[-1] + for offset in index.data_offsets[1:]: + self.data_offsets.append(begin + offset) + self.sizes.extend(index.sizes) + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + with open(data_file_path(another_file), 'rb') as f: + while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, 'wb') + index.write(b'TNTIDX\x00\x00') + index.write(struct.pack(' None: + return super().setup(stage) + + def train_dataloader(self): + return DataLoader( + self.train_dataset, + batch_size=self.hparams.train_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + return DataLoader( + self.valid_dataset, + batch_size=self.hparams.eval_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) + + def test_dataloader(self): + return DataLoader( + self.test_dataset, + batch_size=self.hparams.test_batchsize, + shuffle=True, + num_workers=self.hparams.num_workers, + collate_fn=self.collate_fn, + ) diff --git a/fengshen/data/mmap_dataloader/mmap_index_dataset.py b/fengshen/data/mmap_dataloader/mmap_index_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..53b290c12a8825a483f14ca0535a813b36477fa1 --- /dev/null +++ b/fengshen/data/mmap_dataloader/mmap_index_dataset.py @@ -0,0 +1,53 @@ +import numpy as np +import torch +from typing import List +from torch.utils.data import Dataset + + +class MMapIndexDataset(Dataset): + # datapaths 是所有的内存映射文件的路径 + # input_tensor_name 是输入的tensor的名字 例如 ['input_ids'] 会存储在对应的文件里面 + def __init__(self, datapaths: List[str], input_tensor_name: List[str]): + dict_idx_fp = {} + dict_bin_fp = {} + idx_len = [] + for tensor_name in input_tensor_name: + idx_fp = [] + bin_fp = [] + len = 0 + for data_path in datapaths: + idx_fp += [np.load( + data_path + '_' + tensor_name + '.npy', mmap_mode='r')] + bin_fp += [np.memmap( + data_path + '_' + tensor_name + '.bin', + dtype='long', + mode='r')] + len += idx_fp[-1].shape[0] + idx_len += [idx_fp[-1].shape[0]] + dict_idx_fp[tensor_name] = idx_fp + dict_bin_fp[tensor_name] = bin_fp + #  通常情况下不同的tensor的长度是一样的 + self._len = len + + self._input_tensor_name = input_tensor_name + self._dict_idx_fp = dict_idx_fp + self._dict_bin_fp = dict_bin_fp + self._idx_len = idx_len + + def __len__(self): + return self._len + + def __getitem__(self, idx): + sample = {} + for i in range(len(self._idx_len)): + if idx >= self._idx_len[i]: + idx -= self._idx_len[i] + else: + break + for tensor_name in self._input_tensor_name: + sample[tensor_name] = torch.tensor(self._dict_bin_fp[tensor_name][i][ + self._dict_idx_fp[tensor_name][i][idx, 0]: + self._dict_idx_fp[tensor_name][i][idx, 1] + ], dtype=torch.long) + # print(sample) + return sample diff --git a/fengshen/data/preprocess.py b/fengshen/data/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/data/preprocess.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git 
a/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py new file mode 100644 index 0000000000000000000000000000000000000000..b21ff7a0f9152ac16cb434078ac8436dcceeec1a --- /dev/null +++ b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_collator.py @@ -0,0 +1,274 @@ +from dataclasses import dataclass +from torch.utils.data._utils.collate import default_collate + +import copy +import torch +import numpy as np + +@dataclass +class CollatorForLinear: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + features=[] + + for (ex_index, example) in enumerate(samples): + tokens = copy.deepcopy(example['text_a']) + + label_ids = [self.label2id[x] for x in example['labels']] + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + label_ids = label_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + label_ids += [self.label2id["O"]] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + label_ids = [self.label2id["O"]] + label_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(label_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + label_ids += [pad_token] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(label_ids) == self.args.max_seq_length + + features.append({ + 'input_ids':torch.tensor(input_ids), + 'attention_mask':torch.tensor(input_mask), + 'input_len':torch.tensor(input_len), + 'token_type_ids':torch.tensor(segment_ids), + 'labels':torch.tensor(label_ids), + }) + + return default_collate(features) + +@dataclass +class CollatorForCrf: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + tokens = copy.deepcopy(example['text_a']) + + label_ids = [self.label2id[x] for x in example['labels']] + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + label_ids = label_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + label_ids += [self.label2id["O"]] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + label_ids = [self.label2id["O"]] + label_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(label_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + label_ids += [pad_token] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == 
self.args.max_seq_length + assert len(label_ids) == self.args.max_seq_length + + features.append({ + 'input_ids':torch.tensor(input_ids), + 'attention_mask':torch.tensor(input_mask), + 'input_len':torch.tensor(input_len), + 'token_type_ids':torch.tensor(segment_ids), + 'labels':torch.tensor(label_ids), + }) + + return default_collate(features) + + +@dataclass +class CollatorForSpan: + args = None + tokenizer = None + label2id = None + + def __call__(self, samples): + + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + max_entities_count = 100 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + subjects = copy.deepcopy(example['subject']) + tokens = copy.deepcopy(example['text_a']) + start_ids = [0] * len(tokens) + end_ids = [0] * len(tokens) + subject_ids = [] + for subject in subjects: + label = subject[0] + start = subject[1] + end = subject[2] + start_ids[start] = self.label2id[label] + end_ids[end] = self.label2id[label] + subject_ids.append([self.label2id[label], start, end]) + + subject_ids+=[[-1,-1,-1]]*(max_entities_count-len(subject_ids)) + + if len(tokens) > self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + start_ids = start_ids[: (self.args.max_seq_length - special_tokens_count)] + end_ids = end_ids[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + start_ids += [0] + end_ids += [0] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + start_ids = [0] + start_ids + end_ids = [0] + end_ids + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + input_len = len(input_ids) + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + start_ids += [0] * padding_length + end_ids += [0] * padding_length + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(start_ids) == self.args.max_seq_length + assert len(end_ids) == self.args.max_seq_length + + features.append({ + 'input_ids': torch.tensor(np.array(input_ids)), + 'attention_mask': torch.tensor(np.array(input_mask)), + 'token_type_ids': torch.tensor(np.array(segment_ids)), + 'start_positions': torch.tensor(np.array(start_ids)), + 'end_positions': torch.tensor(np.array(end_ids)), + "subjects": torch.tensor(np.array(subject_ids)), + 'input_len': torch.tensor(np.array(input_len)), + }) + + return default_collate(features) + + +@dataclass +class CollatorForBiaffine: + args = None + tokenizer = None + label2id = None + + + def __call__(self, samples): + + features = [] + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = 0 + special_tokens_count = 2 + segment_id = 0 + + for (ex_index, example) in enumerate(samples): + subjects = copy.deepcopy(example['subject']) + tokens = copy.deepcopy(example['text_a']) + + span_labels = np.zeros((self.args.max_seq_length,self.args.max_seq_length)) + span_labels[:] = self.label2id["O"] + + for subject in subjects: + label = subject[0] + start = subject[1] + end = subject[2] + if start < self.args.max_seq_length - special_tokens_count and end < self.args.max_seq_length - special_tokens_count: + span_labels[start + 1, end + 1] = self.label2id[label] + + if len(tokens) > 
self.args.max_seq_length - special_tokens_count: + tokens = tokens[: (self.args.max_seq_length - special_tokens_count)] + + tokens += [sep_token] + span_labels[len(tokens), :] = self.label2id["O"] + span_labels[:, len(tokens)] = self.label2id["O"] + segment_ids = [segment_id] * len(tokens) + + tokens = [cls_token] + tokens + span_labels[0, :] = self.label2id["O"] + span_labels[:, 0] = self.label2id["O"] + segment_ids = [segment_id] + segment_ids + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + input_mask = [0] * len(input_ids) + span_mask = np.ones(span_labels.shape) + input_len = len(input_ids) + + padding_length = self.args.max_seq_length - len(input_ids) + + input_ids += [pad_token] * padding_length + input_mask += [0] * padding_length + segment_ids += [segment_id] * padding_length + span_labels[input_len:, :] = 0 + span_labels[:, input_len:] = 0 + span_mask[input_len:, :] = 0 + span_mask[:, input_len:] = 0 + span_mask=np.triu(span_mask,0) + span_mask=np.tril(span_mask,10) + + assert len(input_ids) == self.args.max_seq_length + assert len(input_mask) == self.args.max_seq_length + assert len(segment_ids) == self.args.max_seq_length + assert len(span_labels) == self.args.max_seq_length + assert len(span_labels[0]) == self.args.max_seq_length + + features.append({ + 'input_ids': torch.tensor(np.array(input_ids)), + 'attention_mask': torch.tensor(np.array(input_mask)), + 'token_type_ids': torch.tensor(np.array(segment_ids)), + 'span_labels': torch.tensor(np.array(span_labels)), + 'span_mask': torch.tensor(np.array(span_mask)), + 'input_len': torch.tensor(np.array(input_len)), + }) + + return default_collate(features) \ No newline at end of file diff --git a/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e53cbf3d6bd3d2185e66dd0b7fdcfa1b8c44d0 --- /dev/null +++ b/fengshen/data/sequence_tagging_dataloader/sequence_tagging_datasets.py @@ -0,0 +1,116 @@ +from torch.utils.data import Dataset +from fengshen.metric.utils_ner import get_entities + +import os + +def get_datasets(args): + processor = DataProcessor(args.data_dir, args.decode_type) + + train_data = TaskDataset(processor=processor, mode="train") + valid_data = TaskDataset(processor=processor, mode="dev") + test_data = TaskDataset(processor=processor, mode="dev") + + return {"train":train_data,"validation":valid_data,"test":test_data} + +# def get_labels(decode_type): +# with open("/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/labels.txt") as f: +# label_list = ["[PAD]", "[START]", "[END]"] + +# if decode_type=="crf" or decode_type=="linear": +# for line in f.readlines(): +# label_list.append(line.strip()) +# elif decode_type=="biaffine" or decode_type=="span": +# for line in f.readlines(): +# tag = line.strip().split("-") +# if len(tag) == 1 and tag[0] not in label_list: +# label_list.append(tag[0]) +# elif tag[1] not in label_list: +# label_list.append(tag[1]) + +# label2id={label:id for id,label in enumerate(label_list)} +# id2label={id:label for id,label in enumerate(label_list)} +# return label2id, id2label + +class DataProcessor(object): + def __init__(self, data_dir, decode_type) -> None: + super().__init__() + self.data_dir = data_dir + self.decode_type = decode_type + + def get_examples(self, mode): + return self._create_examples(self._read_text(os.path.join(self.data_dir, mode + ".all.bmes")), mode) + + @staticmethod + def get_labels(args): + 
with open(os.path.join(args.data_dir, "labels.txt")) as f: + label_list = ["[PAD]", "[START]", "[END]"] + + if args.decode_type=="crf" or args.decode_type=="linear": + for line in f.readlines(): + label_list.append(line.strip()) + elif args.decode_type=="biaffine" or args.decode_type=="span": + for line in f.readlines(): + tag = line.strip().split("-") + if len(tag) == 1 and tag[0] not in label_list: + label_list.append(tag[0]) + elif tag[1] not in label_list: + label_list.append(tag[1]) + + label2id = {label: i for i, label in enumerate(label_list)} + id2label={id:label for id,label in enumerate(label_list)} + return label2id,id2label + + def _create_examples(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line['words'] + labels = [] + for x in line['labels']: + if 'M-' in x: + labels.append(x.replace('M-', 'I-')) + else: + labels.append(x) + subject = get_entities(labels, id2label=None, markup='bioes') + examples.append({'guid':guid, 'text_a':text_a, 'labels':labels, 'subject':subject}) + return examples + + @classmethod + def _read_text(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + lines.append({"words": words, "labels": labels}) + words = [] + labels = [] + else: + splits = line.split() + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + lines.append({"words": words, "labels": labels}) + return lines + + +class TaskDataset(Dataset): + def __init__(self, processor, mode='train'): + super().__init__() + self.data = self.load_data(processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, processor, mode): + examples = processor.get_examples(mode) + return examples \ No newline at end of file diff --git a/fengshen/data/t5_dataloader/t5_datasets.py b/fengshen/data/t5_dataloader/t5_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..4fd55b8d0be1dd61881b8c782a7eea7a6123efdd --- /dev/null +++ b/fengshen/data/t5_dataloader/t5_datasets.py @@ -0,0 +1,562 @@ +# coding=utf8 +import json +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm +from transformers import BertTokenizer, MT5Config, MT5Tokenizer, BatchEncoding +import torch +import pytorch_lightning as pl +import numpy as np +from itertools import chain +import sys +sys.path.append('../../') + + +def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length): + """This function is copy of `random_spans_helper `__ . + Training parameters to avoid padding with random_spans_noise_mask. + When training a model with random_spans_noise_mask, we would like to set the other + training hyperparmeters in a way that avoids padding. + This function helps us compute these hyperparameters. + We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens, + and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens. + This function tells us the required number of tokens in the raw example (for split_tokens()) + as well as the length of the encoded targets. 
Note that this function assumes + the inputs and targets will have EOS appended and includes that in the reported length. + Args: + inputs_length: an integer - desired length of the tokenized inputs sequence + noise_density: a float + mean_noise_span_length: a float + Returns: + tokens_length: length of original text in tokens + targets_length: an integer - length in tokens of encoded targets sequence + """ + + def _tokens_length_to_inputs_length_targets_length(tokens_length): + num_noise_tokens = int(round(tokens_length * noise_density)) + num_nonnoise_tokens = tokens_length - num_noise_tokens + num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length)) + # inputs contain all nonnoise tokens, sentinels for all noise spans + # and one EOS token. + _input_length = num_nonnoise_tokens + num_noise_spans + 1 + _output_length = num_noise_tokens + num_noise_spans + 1 + return _input_length, _output_length + + tokens_length = inputs_length + + while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length: + tokens_length += 1 + + inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length( + tokens_length) + + # minor hack to get the targets length to be equal to inputs length + # which is more likely to have been set to a nice round number. + if noise_density == 0.5 and targets_length > inputs_length: + tokens_length -= 1 + targets_length -= 1 + return tokens_length, targets_length + + +class UnsuperviseT5Dataset(Dataset): + ''' + Dataset Used for T5 unsuprvise pretrain. + load_data_type = 0: load raw data from data path and save tokenized data, call function load_data + load_data_type = 1: load tokenized data from path, call function load_tokenized_data + load_data_type = 2: load tokenized data from memery data, call function load_tokenized_memory_data + ''' + + def __init__(self, data_path, args, load_data_type=0, data=None): + super().__init__() + + if args.tokenizer_type == 't5_tokenizer': + if args.new_vocab_path is not None: + self.tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path) + else: + self.tokenizer = MT5Tokenizer.from_pretrained(args.pretrained_model_path) + else: + self.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path) + self.noise_density = 0.15 + self.mean_noise_span_length = 3 + self.text_column_name = args.text_column_name + self.dataset_num_workers = args.dataset_num_workers + self.max_seq_length = args.max_seq_length + self.remove_columns = args.remove_columns + # whether load tokenieze data + self.load_data_type = load_data_type + + if self.load_data_type == 0: + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `mlm_probability` and `mean_noise_span_length`. + # We can also define the label length accordingly. 
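As a concrete check of the comment above: with the defaults fixed in this class (noise_density=0.15, mean_noise_span_length=3) and assuming max_seq_length=512, the expansion works out to:

tokens_length, targets_length = compute_input_and_target_lengths(
    inputs_length=512, noise_density=0.15, mean_noise_span_length=3)
# tokens_length == 568: 85 noise tokens collapse into 28 sentinels, so
#   483 non-noise tokens + 28 sentinels + 1 EOS = exactly 512 input tokens
# targets_length == 114: 85 noise tokens + 28 sentinels + 1 EOS

So each raw chunk fed to the masking collator is 568 tokens long, not 512.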
+ self.expanded_inputs_length, self.targets_length = compute_input_and_target_lengths( + inputs_length=self.max_seq_length, + noise_density=self.noise_density, + mean_noise_span_length=self.mean_noise_span_length, + ) + print('self.expanded_inputs_length, self.targets_length:{},{}'.format( + self.expanded_inputs_length, self.targets_length)) + self.data = self.load_data(data_path) + elif self.load_data_type == 1: + self.data = self.load_tokenized_data(data_path) + else: + assert data is not None + self.data = self.load_tokenized_memory_data(data) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path): + # TODO: large data process + from data.fs_datasets import load_dataset + samples = load_dataset( + # samples = datasets.load_from_disk(data_path)['train'] + data_path, num_proc=self.dataset_num_workers)['train'] + # print(samples) + tokenized_datasets = samples.map( + self.tokenize_function, + batched=True, + num_proc=self.dataset_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + ).map( + batched=True, + num_proc=self.dataset_num_workers, + remove_columns=self.remove_columns) + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a + # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value + # might be slower to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co./docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + tokenized_datasets = tokenized_datasets.map( + self.group_texts, + batched=True, + num_proc=self.dataset_num_workers, + # load_from_cache_file=not data_args.overwrite_cache, + ) + return tokenized_datasets + ''' + The function load tokenized data saved from load_data function. + ''' + + def load_tokenized_data(self, data_path): + from data.fs_datasets import load_dataset + samples = load_dataset(data_path)['train'] + return samples + + def load_tokenized_memory_data(self, data): + return data + + # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. + # Since we make sure that all sequences are of the same length, no attention_mask is needed. + def tokenize_function(self, examples): + # 这里add_special_tokens=False,避免句子中间出现eos + return self.tokenizer(examples[self.text_column_name], + add_special_tokens=False, + return_attention_mask=False) + + # Main data processing function that will concatenate all texts from our dataset + # and generate chunks of expanded_inputs_length. + def group_texts(self, examples): + # Concatenate all texts. + concatenated_examples = { + k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= self.expanded_inputs_length: + total_length = ( + total_length // self.expanded_inputs_length) * self.expanded_inputs_length + # Split by chunks of max_len. 
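The group_texts step does nothing more than concatenate every tokenized text and slice the stream into fixed-length pieces, discarding the tail that does not fill a whole chunk. A toy illustration with a hypothetical 23-token stream and chunk size 8:

stream = list(range(23))
chunk = 8
usable = (len(stream) // chunk) * chunk               # 16; the 7-token tail is dropped
chunks = [stream[i:i + chunk] for i in range(0, usable, chunk)]
# chunks == [[0, 1, ..., 7], [8, 9, ..., 15]]

In the real pipeline the chunk size is expanded_inputs_length (see the worked numbers above), so the collator always receives full-length examples and no padding is needed.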
+ result = { + k: [t[i: i + self.expanded_inputs_length] + for i in range(0, total_length, self.expanded_inputs_length)] + for k, t in concatenated_examples.items() + } + return result + + +class UnsuperviseT5DataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('UnsuperviseT5DataModel') + parser.add_argument('--dataset_num_workers', default=8, type=int) + parser.add_argument('--dataloader_num_workers', default=4, type=int) + parser.add_argument( + '--train_data_path', default='wudao_180g_mt5_tokenized', type=str) + parser.add_argument('--train_batchsize', default=2, type=int) + parser.add_argument('--valid_batchsize', default=2, type=int) + parser.add_argument('--train_split_size', default=None, type=float) + parser.add_argument('--tokenizer_type', default='t5_tokenizer', choices=['t5_tokenizer', 'bert_tokenizer']) + parser.add_argument('--text_column_name', default='text') + parser.add_argument('--remove_columns', nargs='+', default=[]) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + if args.train_split_size is not None: + from data.fs_datasets import load_dataset + data_splits = load_dataset(args.train_data_path, num_proc=args.dataset_num_workers) + train_split = data_splits['train'] + test_split = data_splits['test'] + print('train:', train_split, '\ntest_data:', test_split) + self.train_dataset = UnsuperviseT5Dataset('', args, load_data_type=2, data=train_split) + self.test_dataset = UnsuperviseT5Dataset('', args, load_data_type=2, data=test_split) + else: + self.train_data = UnsuperviseT5Dataset(args.train_data_path, args, load_data_type=1) + + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.noise_density = 0.15 + self.mean_noise_span_length = 3 + self.pad_token_id = self.config.pad_token_id + self.decoder_start_token_id = self.config.decoder_start_token_id + self.eos_token_id = self.config.eos_token_id + self.vocab_size = self.config.vocab_size + self.max_seq_length = args.max_seq_length + # 因为加载旧的spm里面已经包括了exrta_ids,但是T5Tokenizer会在spm的基础上再增加100个extra_ids,所以需要指定extra_ids=0 + if args.tokenizer_type == 't5_tokenizer' and args.new_vocab_path is not None: + self.tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path, extra_ids=0) + # 如果是刚开始加载mt5,需要更新vocab_size为提取中英词之后的new_vocab_size + self.vocab_size = len(self.tokenizer) + + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly. 
+ self.expanded_inputs_length, self.targets_length = compute_input_and_target_lengths( + inputs_length=self.max_seq_length, + noise_density=self.noise_density, + mean_noise_span_length=self.mean_noise_span_length, + ) + + def train_dataloader(self): + from fengshen.data.universal_datamodule.universal_sampler import PretrainingSampler + from fengshen.data.universal_datamodule.universal_datamodule import get_consume_samples + # 采用自定义的sampler,确保继续训练能正确取到数据 + consumed_samples = get_consume_samples(self) + batch_sampler = PretrainingSampler( + total_samples=len(self.train_dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=self.trainer.world_size, + ) + return DataLoader( + self.train_dataset, + batch_sampler=batch_sampler, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def predict_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False) + return DataLoader( + self.test_data, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def collate_fn(self, examples): + # convert list to dict and tensorize input + batch = BatchEncoding( + {k: np.array([examples[i][k] for i in range(len(examples))]) + for k, v in examples[0].items()} + ) + + input_ids = np.array(batch['input_ids']) + batch_size, expanded_input_length = input_ids.shape + mask_indices = np.asarray([self.random_spans_noise_mask( + expanded_input_length) for i in range(batch_size)]) + labels_mask = ~mask_indices + + input_ids_sentinel = self.create_sentinel_ids( + mask_indices.astype(np.int8)) + labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8)) + + batch["input_ids"] = self.filter_input_ids( + input_ids, input_ids_sentinel) + batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel) + + if batch["input_ids"].shape[-1] != self.max_seq_length: + raise ValueError( + f"`input_ids` are incorrectly preprocessed. `input_ids` length is \ + {batch['input_ids'].shape[-1]}, but should be {self.targets_length}." + ) + + if batch["labels"].shape[-1] != self.targets_length: + raise ValueError( + f"`labels` are incorrectly preprocessed. `labels` length is \ + {batch['labels'].shape[-1]}, but should be {self.targets_length}." + ) + + batch["decoder_input_ids"] = self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ) + + for k, v in batch.items(): + batch[k] = torch.tensor(v) + # print(k, batch[k], self.tokenizer.batch_decode(batch[k]), '\n', flush=True) + return batch + + def create_sentinel_ids(self, mask_indices): + """ + Sentinel ids creation given the indices that should be masked. + The start indices of each mask are replaced by the sentinel ids in increasing + order. Consecutive mask indices to be deleted are replaced with `-1`. 
+ """ + start_indices = mask_indices - \ + np.roll(mask_indices, 1, axis=-1) * mask_indices + start_indices[:, 0] = mask_indices[:, 0] + + sentinel_ids = np.where(start_indices != 0, np.cumsum( + start_indices, axis=-1), start_indices) + sentinel_ids = np.where( + sentinel_ids != 0, (self.vocab_size - sentinel_ids), 0) + sentinel_ids -= mask_indices - start_indices + + return sentinel_ids + + def filter_input_ids(self, input_ids, sentinel_ids): + """ + Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. + This will reduce the sequence length from `expanded_inputs_length` to `input_length`. + """ + batch_size = input_ids.shape[0] + + input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) + # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are + # masked tokens coming after sentinel tokens and should be removed + input_ids = input_ids_full[input_ids_full >= + 0].reshape((batch_size, -1)) + input_ids = np.concatenate( + [input_ids, np.full((batch_size, 1), self.eos_token_id, dtype=np.int32)], axis=-1 + ) + return input_ids + + # Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right + def shift_tokens_right(self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int) -> np.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + def random_spans_noise_mask(self, length): + """This function is copy of `random_spans_helper `__ . + Noise mask consisting of random spans of noise tokens. + The number of noise tokens and the number of noise spans and non-noise spans + are determined deterministically as follows: + num_noise_tokens = round(length * noise_density) + num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) + Spans alternate between non-noise and noise, beginning with non-noise. + Subject to the above restrictions, all masks are equally likely. + Args: + length: an int32 scalar (length of the incoming token sequence) + noise_density: a float - approximate density of output mask + mean_noise_span_length: a number + Returns: + a boolean tensor with shape [length] + """ + + orig_length = length + + num_noise_tokens = int(np.round(length * self.noise_density)) + # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. + num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) + num_noise_spans = int( + np.round(num_noise_tokens / self.mean_noise_span_length)) + + # avoid degeneracy by ensuring positive number of noise spans + num_noise_spans = max(num_noise_spans, 1) + num_nonnoise_tokens = length - num_noise_tokens + + # pick the lengths of the noise spans and the non-noise spans + def _random_segmentation(num_items, num_segments): + """Partition a sequence of items randomly into non-empty segments. 
+ Args: + num_items: an integer scalar > 0 + num_segments: an integer scalar in [1, num_items] + Returns: + a Tensor with shape [num_segments] containing positive integers that add + up to num_items + """ + mask_indices = np.arange(num_items - 1) < (num_segments - 1) + np.random.shuffle(mask_indices) + first_in_segment = np.pad(mask_indices, [[1, 0]]) + segment_id = np.cumsum(first_in_segment) + # count length of sub segments assuming that list is sorted + _, segment_length = np.unique(segment_id, return_counts=True) + return segment_length + + noise_span_lengths = _random_segmentation( + num_noise_tokens, num_noise_spans) + nonnoise_span_lengths = _random_segmentation( + num_nonnoise_tokens, num_noise_spans) + + interleaved_span_lengths = np.reshape( + np.stack([nonnoise_span_lengths, noise_span_lengths], + axis=1), [num_noise_spans * 2] + ) + span_starts = np.cumsum(interleaved_span_lengths)[:-1] + span_start_indicator = np.zeros((length,), dtype=np.int8) + span_start_indicator[span_starts] = True + span_num = np.cumsum(span_start_indicator) + is_noise = np.equal(span_num % 2, 1) + + return is_noise[:orig_length] + + +class TaskT5Dataset(Dataset): + def __init__(self, data_path, args): + super().__init__() + self.max_length = args.max_seq_length + if args.tokenizer_type == 't5_tokenizer': + self.tokenizer = MT5Tokenizer.from_pretrained(args.pretrained_model_path) + else: + self.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path) + self.data = self.load_data(data_path) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + samples = [] + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + for line in tqdm(lines): + samples.append(json.loads(line)) + return samples + + def encode(self, item): + if item["textb"] != "": + text = item['question'] + ','.join(item['choice'])+'。' + f"""{item["texta"]}""" + f"""{item["textb"]}""" + else: + text = f"""{item["question"]}""" + ",".join(item["choice"]) + "。" + f"""{item["texta"]}""" + label = item['answer'] + encode_dict = self.tokenizer.encode_plus(text, max_length=self.max_length, padding='max_length', + truncation=True, return_tensors='pt') + decode_dict = self.tokenizer.encode_plus(label, max_length=16, padding='max_length', + truncation=True) + + answer_token = [] + max_label_len = 0 + choice_encode = [] # 用来确定模型生成的最大长度 + for a in item['choice']: + answer_encode = self.tokenizer.encode(a) + choice_encode.append(answer_encode) + if len(answer_encode) > max_label_len: + max_label_len = len(answer_encode) + for an in answer_encode: + if an not in answer_token: + answer_token.append(an) + + # bad_words_ids = [[i] for i in range(self.tokenizer.vocab_size) if i not in answer_token] #不生成这些token + + # while len(bad_words_ids) None: + super().__init__() + + if args.tokenizer_type == "t5_tokenizer": + self.tokenizer = MT5Tokenizer.from_pretrained( + args.pretrained_model_path) + if len(self.tokenizer) == 32596: + self.tokenizer.add_special_tokens(special_token_dict) + print( + "add special tokens to tokenizer,vocab size:", + len(self.tokenizer) + ) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + self.model.resize_token_embeddings(len(self.tokenizer)) + self.model.save_pretrained(args.new_vocab_path) + self.tokenizer.save_pretrained( + args.new_vocab_path) + else: + self.tokenizer = BertTokenizer.from_pretrained( + args.pretrained_model_path) + + self.load_data_type = 
load_data_type + self.data_split = data + self.num_workers = args.preprocessing_num_workers + self.max_seq_length = args.max_seq_length + self.max_knowledge_length = args.max_knowledge_length + self.max_target_length = args.max_target_length + + # tokenizer config + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.decoder_start_token_id = self.config.decoder_start_token_id + self.eos_token_id = self.config.eos_token_id + self.vocab_size = self.config.vocab_size + # print(self.tokenizer.decode([2])) + + # load from raw data or hf dataset + + if self.load_data_type == 0: + self.data = self.load_data(data_path) + elif self.load_data_type == 1: + self.data = self.load_packed_data(data_path) + else: # for testing + self.data = data_path + + def load_packed_data(self, data_path): + from fengshen.data.fs_datasets import load_dataset + + samples = load_dataset(data_path, + num_proc=self.num_workers)[self.data_split] + tokenized_samples = samples.map( + self.regular_tokenize, batched=False, + num_proc=self.num_workers + ) + + return tokenized_samples + + def load_data(self, data_path): + """ + load data from raw data + return untokoenized data + """ + from datasets import load_dataset + + ds = load_dataset("json", data_files=data_path)['train'] + samples = ds.map(self.regular_tokenize, batched=False, num_proc=self.num_workers + ) + return samples + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + def regular_tokenize(self, sample): + # print(len(sample['context'])) + context_ids = self.tokenizer( + sample["context"], + add_special_tokens=True, + return_attention_mask=False, + return_token_type_ids=True, + ) + + context_types = self.get_token_type( + sample["context"], context_ids["token_type_ids"] + ) + # print('context',sample['context']) + # print('context_ids',context_ids['input_ids']) + knowledge_ids = self.tokenizer.encode( + sample["knowledge"], add_special_tokens=False + ) + # print('knowledge_ids',knowledge_ids) + if isinstance(knowledge_ids, int): + knowledge_ids = [knowledge_ids] + target_ids = self.tokenizer.encode( + sample["target"], + add_special_tokens=False, + max_length=self.max_target_length - 1, + truncation=True, + ) + # print('target',sample['target']) + # print('target_ids',target_ids) + # print('decode target',self.tokenizer.decode(target_ids)) + # truncate + + knowledge_ids = ( + [self.tokenizer.convert_tokens_to_ids("[KNSTART]")] + + knowledge_ids[: self.max_knowledge_length - 2] + + [self.tokenizer.convert_tokens_to_ids("[KNEND]")] + ) + l_kn = len(knowledge_ids) + knowledge_types = [2] * l_kn + + flatten_context = [] + for line in context_ids["input_ids"]: + flatten_context.extend(line) + l_ct = min(len(flatten_context), self.max_seq_length - l_kn - 2) + context_ids = ( + [self.tokenizer.convert_tokens_to_ids("[CTSTART]")] + + flatten_context[-l_ct:] + + [self.tokenizer.convert_tokens_to_ids("[CTEND]")] + ) + + context_types = context_types[-l_ct:] + [0] + context_types.insert(0, context_types[0]) + assert len(context_ids) == len( + context_types + ), "len of context ids and token types unmatch, context:{},ids:{} types:{},len {}:{}".format( + sample["context"], + context_ids, + context_types, + len(context_ids), + len(context_types), + ) + + try: + target_ids = target_ids + [self.eos_token_id] + except exception: + print(sample["target"], target_ids, self.eos_token_id) + + tokenized = {} + tokenized["input_ids"] = np.array(context_ids + knowledge_ids, dtype=np.int32) + 
tokenized["token_types"] = np.array( + context_types + knowledge_types, dtype=np.int32 + ) + tokenized["attention_mask"] = np.ones( + len(context_types + knowledge_types), dtype=np.int8 + ) + tokenized["labels"] = np.array(target_ids, dtype=np.int32) + + return tokenized + + def get_token_type(self, context, tokentypes=None): + # token_type fail in tokenizer, all zero + context_token_types = [] + for i, line in enumerate(context): + if tokentypes: + if i % 2 == 0: + token_type = [0] * len(tokentypes[i]) + else: + token_type = [1] * len(tokentypes[i]) + else: + if i % 2 == 0: + token_type = [0] * (1 + len(line)) + else: + token_type = [1] * (1 + len(line)) + + context_token_types.extend(token_type) + + return context_token_types + + +class DialogDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group("SuperviseT5DataModel") + parser.add_argument("--dataset_num_workers", default=8, type=int) + parser.add_argument("--dataloader_num_workers", default=4, type=int) + parser.add_argument("--train_data_path", default="dialog_4g_test", type=str) + parser.add_argument( + "--valid_data_path", default="wudao_180g_mt5_tokenized", type=str + ) + parser.add_argument("--train_batchsize", default=2, type=int) + parser.add_argument("--valid_batchsize", default=2, type=int) + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument("--max_knowledge_length", default=128, type=int) + parser.add_argument("--max_target_length", default=128, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.load_data(args) + self.epochs = args.max_epochs + + def load_data(self, args): + if args.train_split_size is not None: + from fengshen.data.fs_datasets import load_dataset + + data_splits = load_dataset( + args.train_data_path, num_proc=args.dataset_num_workers + ) + train_split = data_splits['train'] + test_split = data_splits['test'] + print('train:', train_split, '\ntest_data:', test_split) + self.train_dataset = DialogDataset( + args.train_data_path, args, load_data_type=1, data="train" + ) + self.test_dataset = DialogDataset( + args.train_data_path, args, load_data_type=1, data="test" + ) + else: + self.train_data = DialogDataset( + args.train_data_path, args, load_data_type=1 + ) + + self.config = MT5Config.from_pretrained(args.pretrained_model_path) + self.pad_token_id = self.config.pad_token_id + self.decoder_start_token_id = self.config.decoder_start_token_id + print("bos id:", self.decoder_start_token_id) + + def collate_fn(self, samples): + batch = { + k: [ + torch.tensor(samples[i][k], dtype=torch.int64) + for i in range(len(samples)) + ] + for k in ["input_ids", "token_types", "attention_mask", "labels"] + } + + # print(batch) + for k, v in batch.items(): + if k != "labels": + batch[k] = pad_sequence( + v, batch_first=True, padding_value=self.pad_token_id + ) + else: + batch[k] = pad_sequence(v, batch_first=True, padding_value=-100) + batch["decoder_input_ids"] = torch.tensor( + self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ), + dtype=torch.long, + ) + return batch + + def shift_tokens_right( + self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int + ) -> np.ndarray: + """ + Shift input ids one token to the right. 
+ """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids + ) + return shifted_input_ids + + def train_dataloader(self): + from fengshen.data.universal_datamodule.universal_sampler import ( + PretrainingRandomSampler, + ) + from fengshen.data.universal_datamodule.universal_datamodule import ( + get_consume_samples, + ) + + # 采用自定义的sampler,确保继续训练能正确取到数据 + consumed_samples = get_consume_samples(self) + batch_sampler = PretrainingRandomSampler( + epoch=self.epochs, + total_samples=len(self.train_dataset), + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, # gpu idx + data_parallel_size=self.trainer.world_size, # gpu num + ) + return DataLoader( + self.train_dataset, + batch_sampler=batch_sampler, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def val_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False + ) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + def predict_dataloader(self): + sampler = torch.utils.data.distributed.DistributedSampler( + self.test_dataset, shuffle=False + ) + return DataLoader( + self.test_dataset, + sampler=sampler, + shuffle=False, + batch_size=self.hparams.valid_batchsize, + pin_memory=True, + num_workers=self.hparams.dataloader_num_workers, + collate_fn=self.collate_fn, + ) + + +if __name__ == "__main__": + # test + import argparse + + total_parser = argparse.ArgumentParser("DATASET parser") + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["bert_tokenizer", "t5_tokenizer"], + ) + total_parser.add_argument("--preprocessing_num_workers", default="10", type=int) + total_parser.add_argument( + "--new_vocab_path", + default="/cognitive_comp/hejunqing/projects/Dialog_pretrain/randeng_t5_newvocab_784M", + type=str, + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument( + "--pretrained_model_path", + default="/cognitive_comp/hejunqing/projects/Dialog_pretrain/randeng_t5_newvocab_784M", + ) + total_parser = DialogDataModel.add_data_specific_args(total_parser) + args = total_parser.parse_args() + dl = DialogDataModel(args) + + for i in range(5): + for batch in dl.train_dataloader(): + print(batch) + print(batch["input_ids"]) + print(batch["token_types"]) + print(batch["decoder_input_ids"]) + print(batch["labels"]) + + print("test finish") diff --git a/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py b/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..73e1071ac27c9839030734fe664abbcfef08d96b --- /dev/null +++ b/fengshen/data/taiyi_stable_diffusion_datasets/taiyi_datasets.py @@ -0,0 +1,173 @@ +from torch.utils.data import Dataset, ConcatDataset +import os +from concurrent.futures import ProcessPoolExecutor +import pandas as pd + + +def add_data_args(parent_args): + parser = parent_args.add_argument_group('taiyi stable diffusion data args') + # 支持传入多个路径,分别加载 + parser.add_argument( + "--datasets_path", type=str, default=None, 
required=True, nargs='+', + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--datasets_type", type=str, default=None, required=True, choices=['txt', 'csv', 'fs_datasets'], nargs='+', + help="dataset type, txt or csv, same len as datasets_path", + ) + parser.add_argument( + "--resolution", type=int, default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", default=False, + help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--thres", type=float, default=0.2) + return parent_args + + +class TXTDataset(Dataset): + # 添加Txt数据集读取,主要是针对Zero23m数据集。 + def __init__(self, + foloder_name, + thres=0.2): + super().__init__() + # print(f'Loading folder data from {foloder_name}.') + self.image_paths = [] + ''' + 暂时没有开源这部分文件 + score_data = pd.read_csv(os.path.join(foloder_name, 'score.csv')) + img_path2score = {score_data['image_path'][i]: score_data['score'][i] + for i in range(len(score_data))} + ''' + # print(img_path2score) + # 这里都存的是地址,避免初始化时间过多。 + for each_file in os.listdir(foloder_name): + if each_file.endswith('.jpg'): + self.image_paths.append(os.path.join(foloder_name, each_file)) + + # print('Done loading data. Len of images:', len(self.image_paths)) + + def __len__(self): + return len(self.image_paths) + + def __getitem__(self, idx): + img_path = str(self.image_paths[idx]) + caption_path = img_path.replace('.jpg', '.txt') # 图片名称和文本名称一致。 + with open(caption_path, 'r') as f: + caption = f.read() + return {'img_path': img_path, 'caption': caption} + + +# NOTE 加速读取数据,直接用原版的,在外部使用并行读取策略。30min->3min +class CSVDataset(Dataset): + def __init__(self, + input_filename, + image_root, + img_key, + caption_key, + thres=0.2): + super().__init__() + # logging.debug(f'Loading csv data from {input_filename}.') + print(f'Loading csv data from {input_filename}.') + self.images = [] + self.captions = [] + + if input_filename.endswith('.csv'): + # print(f"Load Data from{input_filename}") + df = pd.read_csv(input_filename, index_col=0, on_bad_lines='skip') + print(f'file {input_filename} datalen {len(df)}') + # 这个图片的路径也需要根据数据集的结构稍微做点修改 + self.images.extend(df[img_key].tolist()) + self.captions.extend(df[caption_key].tolist()) + self.image_root = image_root + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = os.path.join(self.image_root, str(self.images[idx])) + return {'img_path': img_path, 'caption': self.captions[idx]} + + +def if_final_dir(path: str) -> bool: + # 如果当前目录有一个文件,那就算是终极目录 + for f in os.scandir(path): + if f.is_file(): + return True + return False + + +def process_pool_read_txt_dataset(args, + input_root=None, + thres=0.2): + p = ProcessPoolExecutor(max_workers=20) + all_datasets = [] + res = [] + + # 遍历该目录下所有的子目录 + def traversal_files(path: str): + list_subfolders_with_paths = [f.path for f in os.scandir(path) if f.is_dir()] + for dir_path in list_subfolders_with_paths: + if if_final_dir(dir_path): + res.append(p.submit(TXTDataset, + dir_path, + thres)) + else: + traversal_files(dir_path) + traversal_files(input_root) + p.shutdown() + for future in res: + all_datasets.append(future.result()) + dataset = ConcatDataset(all_datasets) + return dataset + + +def process_pool_read_csv_dataset(args, + input_root, + thres=0.20): + # here input_filename is a directory containing a CSV file + all_csvs = 
os.listdir(os.path.join(input_root, 'release')) + image_root = os.path.join(input_root, 'images') + # csv_with_score = [each for each in all_csvs if 'score' in each] + all_datasets = [] + res = [] + p = ProcessPoolExecutor(max_workers=150) + for path in all_csvs: + each_csv_path = os.path.join(input_root, 'release', path) + res.append(p.submit(CSVDataset, + each_csv_path, + image_root, + img_key="name", + caption_key="caption", + thres=thres)) + p.shutdown() + for future in res: + all_datasets.append(future.result()) + dataset = ConcatDataset(all_datasets) + return dataset + + +def load_data(args, global_rank=0): + assert len(args.datasets_path) == len(args.datasets_type), \ + "datasets_path num not equal to datasets_type" + all_datasets = [] + for path, type in zip(args.datasets_path, args.datasets_type): + if type == 'txt': + all_datasets.append(process_pool_read_txt_dataset( + args, input_root=path, thres=args.thres)) + elif type == 'csv': + all_datasets.append(process_pool_read_csv_dataset( + args, input_root=path, thres=args.thres)) + elif type == 'fs_datasets': + from fengshen.data.fs_datasets import load_dataset + all_datasets.append(load_dataset(path, num_proc=args.num_workers, + thres=args.thres, global_rank=global_rank)['train']) + else: + raise ValueError('unsupport dataset type: %s' % type) + print(f'load datasset {type} {path} len {len(all_datasets[-1])}') + return {'train': ConcatDataset(all_datasets)} diff --git a/fengshen/data/task_dataloader/__init__.py b/fengshen/data/task_dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25810ab9ab20ad36f72ba20b31768341e78e2676 --- /dev/null +++ b/fengshen/data/task_dataloader/__init__.py @@ -0,0 +1,3 @@ +# coding=utf-8 +from .task_datasets import LCSTSDataModel, LCSTSDataset +__all__ = ['LCSTSDataModel', 'LCSTSDataset'] diff --git a/fengshen/data/task_dataloader/medicalQADataset.py b/fengshen/data/task_dataloader/medicalQADataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3d76ed583c7d150769c81d830293909e1c110485 --- /dev/null +++ b/fengshen/data/task_dataloader/medicalQADataset.py @@ -0,0 +1,137 @@ +# coding=utf8 +import os +import pytorch_lightning as pl +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm +from transformers import AutoTokenizer + + +class GPT2QADataset(Dataset): + ''' + Dataset Used for yuyuan medical qa task. + Just surpport small datasets, when deal with large datasets it may be slowly. 
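+    Each line of the data file is a Python dict literal with 'Question' and 'answer' keys (parsed in data_parse).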
+ for large datasets please use mmapdatasets(doing) + ''' + + def __init__(self, data_path, name, args): + super().__init__() + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path) + if self.tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'}) + self.data_size = os.path.getsize(data_path)/1024/1024/1024 + self.data_type_name = name + self.data = self.load_data(data_path) + self.max_seq_length = args.max_seq_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + # 有进度条展示 + if self.data_size <= 5: + with open(data_path, "rt", encoding='utf8') as f: + lines = f.readlines() + total_num = len(lines) + data_gen = lines + else: + data_gen = open(data_path, "rt", encoding='utf8') + total_num = None + + data = [] + with tqdm(total=total_num, desc=f'{self.data_type_name}处理进度', mininterval=0.3) as bar: + for idx, line in enumerate(data_gen): + data.append(self.data_parse(line)) + bar.update() + + if self.data_size > 5: + data_gen.close() + return data + + def data_parse(self, line): + """ + 解析不同格式的数据 + """ + dic = eval(line.strip()) + return dic + + def encode(self, item): + """ + 将数据转换成模型训练的输入 + """ + inputs_dict = self.tokenizer.encode_plus(item['Question']+item['answer'], + max_length=self.max_seq_length, padding='max_length', + truncation=True, return_tensors='pt') + target = inputs_dict['input_ids'] + labels = target.clone().detach() + labels[target == self.tokenizer.pad_token_id] = -100 + return { + "input_ids": inputs_dict['input_ids'].squeeze(), + "attention_mask": inputs_dict['attention_mask'].squeeze(), + "labels": labels.squeeze(), + "question": item['Question'], + "answer": item['answer'] + } + + +class GPT2QADataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('GPT2QADataModel') + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--num_workers', default=2, type=int) + parser.add_argument('--train_data', default='train.txt', type=str) + parser.add_argument('--valid_data', default='valid.txt', type=str) + parser.add_argument('--test_data', default='test.txt', type=str) + parser.add_argument('--train_batchsize', type=int, required=True) + parser.add_argument('--valid_batchsize', type=int, required=True) + parser.add_argument('--max_seq_length', default=1024, type=int) + return parent_args + + def __init__(self, args): + super().__init__() + self.args = args + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + if not args.do_eval_only: + self.train_data = GPT2QADataset(os.path.join( + args.data_dir, args.train_data), '训练集', args) + self.valid_data = GPT2QADataset(os.path.join( + args.data_dir, args.valid_data), '验证集', args) + self.test_data = GPT2QADataset(os.path.join( + args.data_dir, args.test_data), '测试集', args) + + def train_dataloader(self): + return DataLoader( + self.train_data, shuffle=True, + batch_size=self.train_batchsize, + pin_memory=False, num_workers=self.args.num_workers) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, num_workers=self.args.num_workers) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, + batch_size=self.valid_batchsize, pin_memory=False, + num_workers=self.args.num_workers) + + +if __name__ == '__main__': + 
import argparse + modelfile = '/cognitive_comp/wuziwei/pretrained_model_hf/medical_v2' + datafile = '/cognitive_comp/wuziwei/task-data/medical_qa/medical_qa_train.txt' + parser = argparse.ArgumentParser(description='hf test', allow_abbrev=False) + group = parser.add_argument_group(title='test args') + group.add_argument('--pretrained-model-path', type=str, default=modelfile, + help='Number of transformer layers.') + group.add_argument('--max-seq-length', type=int, default=1024) + args = parser.parse_args() + + testml = GPT2QADataset(datafile, 'medical_qa', args=args) + + print(testml[10]) diff --git a/fengshen/data/task_dataloader/task_datasets.py b/fengshen/data/task_dataloader/task_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..a8fe7bcf732c61725853df92d9422f207d55f785 --- /dev/null +++ b/fengshen/data/task_dataloader/task_datasets.py @@ -0,0 +1,206 @@ +# coding=utf8 +from torch.utils.data import Dataset, DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer +import json +import torch +import pytorch_lightning as pl +import os + + +class AbstractCollator: + """ + collector for summary task + """ + + def __init__(self, tokenizer, max_enc_length, max_dec_length, prompt): + self.tokenizer = tokenizer + self.max_enc_length = max_enc_length + self.max_dec_length = max_dec_length + self.prompt = prompt + + def __call__(self, samples): + + labels = [] + attn_mask = [] + # decoder_attn_mask = [] + source_inputs = [] + for sample in samples: + encode_dict = self.tokenizer.encode_plus( + self.prompt + sample['text'], + max_length=self.max_enc_length, + padding='max_length', + truncation=True, + return_tensors='pt') + decode_dict = self.tokenizer.encode_plus( + sample['summary'], + max_length=self.max_dec_length, + padding='max_length', + truncation=True, + return_tensors='pt') + source_inputs.append(encode_dict['input_ids'].squeeze()) + labels.append(decode_dict['input_ids'].squeeze()) + attn_mask.append(encode_dict['attention_mask'].squeeze()) + # decoder_attn_mask.append(decode_dict['attention_mask'].squeeze()) + # labels = torch.tensor(decode_dict['input']) + + source_inputs = torch.stack(source_inputs) + labels = torch.stack(labels) + attn_mask = torch.stack(attn_mask) + # decoder_attn_mask = torch.stack(decoder_attn_mask) + # decode_input_idxs = shift_tokens_right(labels, self.tokenizer.pad_token_id, self.tokenizer.pad_token_id) + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 + + return { + "input_ids": source_inputs, + "attention_mask": attn_mask, + "labels": labels, + "text": [sample['text'] for sample in samples], + "summary": [sample['summary'] for sample in samples] + } + + +class LCSTSDataset(Dataset): + ''' + Dataset Used for LCSTS summary task. 
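+    Each line of the data file is a JSON object with "text" and "summary" fields.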
+ ''' + + def __init__(self, data_path, args): + super().__init__() + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path, use_fast=False) + self.data = self.load_data(data_path) + self.prompt = args.prompt + self.max_enc_length = args.max_enc_length + self.max_dec_length = args.max_dec_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.encode(self.data[index]) + + def load_data(self, data_path): + with open(data_path, "r", encoding='utf8') as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + obj = json.loads(line) + source = obj['text'] + target = obj['summary'] + samples.append({ + "text": source, + "summary": target + }) + return samples + + def cal_data(self, data_path): + with open(data_path, "r", encoding='utf8') as f: + lines = f.readlines() + samples = [] + enc_sizes = [] + dec_sizes = [] + for line in tqdm(lines): + obj = json.loads(line.strip()) + source = obj['text'] + target = obj['summary'] + enc_input_ids = self.tokenizer.encode(source) + target = self.tokenizer.encode(target) + enc_sizes.append(len(enc_input_ids)) + dec_sizes.append(len(target)-1) + samples.append({ + "enc_input_ids": enc_input_ids, + "dec_input_ids": target[:-1], + "label_ids": target[1:] + }) + max_enc_len = max(enc_sizes) + max_dec_len = max(dec_sizes) + import numpy as np + # mean of len(enc_input_ids): 74.68041911345998 + # mean of len(dec_input_ids): 14.02265483791283 + # max of len(enc_input_ids): 132 + # max of len(dec_input_ids): 31 + print('mean of len(enc_input_ids):', np.mean(enc_sizes), + 'mean of len(dec_input_ids):', np.mean(dec_sizes), + 'max of len(enc_input_ids):', max_enc_len, + 'max of len(dec_input_ids):', max_dec_len) + return samples + + def encode(self, item): + encode_dict = self.tokenizer.encode_plus( + self.prompt + item['text'], + max_length=self.max_enc_length, + padding='max_length', + truncation=True, + return_tensors='pt') + decode_dict = self.tokenizer.encode_plus( + item['summary'], + max_length=self.max_dec_length, + padding='max_length', + truncation=True) + + target = decode_dict['input_ids'] + # print('encode_dict shape:', encode_dict['input_ids'].shape) + labels = torch.tensor(target) + labels[target == self.tokenizer.pad_token_id] = -100 + return { + "input_ids": encode_dict['input_ids'].squeeze(), + "attention_mask": encode_dict['attention_mask'].squeeze(), + "labels": labels.squeeze(), + "text": item['text'], + "summary": item['summary'] + } + + +class LCSTSDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('LCSTSDataModel') + parser.add_argument( + '--data_dir', default='/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.jsonl', type=str) + parser.add_argument('--valid_data', default='valid.jsonl', type=str) + parser.add_argument('--test_data', default='test_public.jsonl', type=str) + parser.add_argument('--train_batchsize', default=128, type=int) + parser.add_argument('--valid_batchsize', default=128, type=int) + parser.add_argument('--max_enc_length', default=128, type=int) + parser.add_argument('--max_dec_length', default=30, type=int) + parser.add_argument('--prompt', default='summarize:', type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.args = args + self.train_batchsize = args.train_batchsize + self.valid_batchsize = 
args.valid_batchsize + if not args.do_eval_only: + self.train_data = LCSTSDataset(os.path.join( + args.data_dir, args.train_data), args) + self.valid_data = LCSTSDataset(os.path.join( + args.data_dir, args.valid_data), args) + self.test_data = LCSTSDataset(os.path.join( + args.data_dir, args.test_data), args) + + def train_dataloader(self): + return DataLoader(self.train_data, + shuffle=True, + batch_size=self.train_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) + + def val_dataloader(self): + return DataLoader(self.valid_data, + shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) + + def predict_dataloader(self): + return DataLoader(self.test_data, + shuffle=False, + batch_size=self.valid_batchsize, + pin_memory=False, + num_workers=self.args.num_workers) diff --git a/fengshen/data/universal_datamodule/__init__.py b/fengshen/data/universal_datamodule/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68169d26a8424ae877b5c7efc2b7be2e761cd3cb --- /dev/null +++ b/fengshen/data/universal_datamodule/__init__.py @@ -0,0 +1,4 @@ +from .universal_datamodule import UniversalDataModule +from .universal_sampler import PretrainingSampler, PretrainingRandomSampler + +__all__ = ['UniversalDataModule', 'PretrainingSampler', 'PretrainingRandomSampler'] diff --git a/fengshen/data/universal_datamodule/universal_datamodule.py b/fengshen/data/universal_datamodule/universal_datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..240557694e97197f08a310351eb6206973107c4d --- /dev/null +++ b/fengshen/data/universal_datamodule/universal_datamodule.py @@ -0,0 +1,165 @@ +from pytorch_lightning import LightningDataModule +from typing import Optional + +from torch.utils.data import DataLoader, DistributedSampler + + +def get_consume_samples(data_model: LightningDataModule) -> int: + if hasattr(data_model.trainer.lightning_module, 'consumed_samples'): + consumed_samples = data_model.trainer.lightning_module.consumed_samples + print('get consumed samples from model: {}'.format(consumed_samples)) + else: + world_size = data_model.trainer.world_size + consumed_samples = max(0, data_model.trainer.global_step - 1) * \ + data_model.hparams.train_batchsize * world_size * data_model.trainer.accumulate_grad_batches + print('calculate consumed samples: {}'.format(consumed_samples)) + return consumed_samples + + +class UniversalDataModule(LightningDataModule): + @ staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('Universal DataModule') + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--dataloader_workers', default=2, type=int) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--val_batchsize', default=16, type=int) + parser.add_argument('--test_batchsize', default=16, type=int) + parser.add_argument('--datasets_name', type=str, default=None) + parser.add_argument('--train_datasets_field', type=str, default='train') + parser.add_argument('--val_datasets_field', type=str, default='validation') + parser.add_argument('--test_datasets_field', type=str, default='test') + parser.add_argument('--train_file', type=str, default=None) + parser.add_argument('--val_file', type=str, default=None) + parser.add_argument('--test_file', type=str, default=None) + parser.add_argument('--raw_file_type', type=str, default='json') + parser.add_argument('--sampler_type', type=str, + choices=['single', + 
'random'], + default='random') + return parent_args + + def __init__( + self, + tokenizer, + collate_fn, + args, + datasets=None, + **kwargs, + ): + super().__init__() + # 如果不传入datasets的名字,则可以在对象外部替换内部的datasets为模型需要的 + if datasets is not None: + self.datasets = datasets + elif args.datasets_name is not None: + from fengshen.data.fs_datasets import load_dataset + print('---------begin to load datasets {}'.format(args.datasets_name)) + self.datasets = load_dataset( + args.datasets_name, num_proc=args.num_workers) + print('---------ending load datasets {}'.format(args.datasets_name)) + else: + print('---------begin to load datasets from local file') + from datasets import load_dataset + self.datasets = load_dataset(args.raw_file_type, + data_files={ + args.train_datasets_field: args.train_file, + args.val_datasets_field: args.val_file, + args.test_datasets_field: args.test_file}) + print('---------end to load datasets from local file') + + self.tokenizer = tokenizer + self.collate_fn = collate_fn + self.save_hyperparameters(args) + + def get_custom_sampler(self, ds): + from .universal_sampler import PretrainingRandomSampler + from .universal_sampler import PretrainingSampler + world_size = self.trainer.world_size + consumed_samples = get_consume_samples(self) + # use the user default sampler + if self.hparams.sampler_type == 'random': + return PretrainingRandomSampler( + total_samples=len(ds), + # consumed_samples cal by global steps + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=world_size, + epoch=self.trainer.current_epoch, + ) + elif self.hparams.sampler_type == 'single': + return PretrainingSampler( + total_samples=len(ds), + # consumed_samples cal by global steps + consumed_samples=consumed_samples, + micro_batch_size=self.hparams.train_batchsize, + data_parallel_rank=self.trainer.global_rank, + data_parallel_size=world_size, + ) + else: + raise Exception('Unknown sampler type: {}'.format(self.hparams.sampler_type)) + + def setup(self, stage: Optional[str] = None) -> None: + return + + def train_dataloader(self): + ds = self.datasets[self.hparams.train_datasets_field] + + collate_fn = self.collate_fn + if hasattr(ds, 'collate_fn'): + collate_fn = ds.collate_fn + + if self.hparams.replace_sampler_ddp is False: + return DataLoader( + ds, + batch_sampler=self.get_custom_sampler(ds), + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + pin_memory=True, + ) + return DataLoader( + ds, + batch_size=self.hparams.train_batchsize, + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + pin_memory=True, + ) + + def val_dataloader(self): + ds = self.datasets[self.hparams.val_datasets_field] + collate_fn = self.collate_fn + if hasattr(ds, 'collate_fn'): + collate_fn = ds.collate_fn + + return DataLoader( + ds, + batch_size=self.hparams.val_batchsize, + shuffle=False, + num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + sampler=DistributedSampler( + ds, shuffle=False), + pin_memory=True, + ) + + # return DataLoader( + # ds, shuffle=False, batch_size=self.hparams.val_batchsize, pin_memory=False, collate_fn=collate_fn, + # ) + + def test_dataloader(self): + ds = self.datasets[self.hparams.test_datasets_field] + + collate_fn = self.collate_fn + if collate_fn is None and hasattr(ds, 'collater'): + collate_fn = ds.collater + + return DataLoader( + ds, + batch_size=self.hparams.test_batchsize, + shuffle=False, + 
num_workers=self.hparams.dataloader_workers, + collate_fn=collate_fn, + sampler=DistributedSampler( + ds, shuffle=False), + pin_memory=True, + ) diff --git a/fengshen/data/universal_datamodule/universal_sampler.py b/fengshen/data/universal_datamodule/universal_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..86db3016d0f9795f5c8e501da2ff55c6e34e7222 --- /dev/null +++ b/fengshen/data/universal_datamodule/universal_sampler.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataloaders.""" + + +import torch + + +class PretrainingSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples // self.micro_batch_times_data_parallel_size + + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class PretrainingRandomSampler: + + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, epoch): + # Keep a copy of input params for later use. 
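+        # `epoch` seeds the permutation in __iter__, so shuffling is deterministic across data-parallel ranks and can be advanced with set_epoch().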
+ self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + self.epoch = epoch + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples // self.micro_batch_times_data_parallel_size + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + # data sharding and random sampling + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + + batch = [] + # Last batch if not complete will be dropped. + for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/fengshen/examples/DAVAE/generate.py b/fengshen/examples/DAVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..5d5aebfeb8d68d77bc6c0045ea3c36d789de17ec --- /dev/null +++ b/fengshen/examples/DAVAE/generate.py @@ -0,0 +1,36 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : generate.py +@Time : 2022/11/04 19:17 +@Author : Liang Yuxin +@Version : 1.0 +@Contact : liangyuxin@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +import torch +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from transformers import BertTokenizer,T5Tokenizer +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +vae_model = DAVAEModel.from_pretrained("IDEA-CCNL/Randeng-DAVAE-1.2B-General-Chinese").to(device) +input_texts = [ + "针对电力系统中的混沌振荡对整个互联电网的危害问题,提出了一种基于非线性光滑函数的滑模控制方法.", + "超市面积不算大.挺方便附近的居民购买的. 生活用品也比较齐全.价格适用中.", +] +output_texts = vae_model.simulate_batch(encoder_tokenizer,decoder_tokenizer,input_texts) +print(output_texts) diff --git a/fengshen/examples/FastDemo/README.md b/fengshen/examples/FastDemo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..132519b95da3fd35f4c4fb6aae5d8c44faad3a42 --- /dev/null +++ b/fengshen/examples/FastDemo/README.md @@ -0,0 +1,105 @@ +# 「streamlit」快速搭建你的算法demo +在搭建demo之前,首先得做好这些准备工作: +- 模型训练完毕 +- 模型的入参确定 +- 安装streamlit库,`pip install streamlit` 就可以安装。 + +streamlit脚本的启动方式是 `streamlit run demo.py`,很简单就启动了一个demo页面,页面会随着脚本代码的改变实时刷新的。所以在没有经验的时候,可以创建一个demo.py的文件,照着下面的教程一步一步添加代码,看页面的展示情况。下面开始上干货,具体细节在代码注释中有说明! + +### 第一步 导包 +```python +import streamlit as st +# 其他包更具你的需要导入 +``` +[streamlit](https://streamlit.io)是一个用于构建机器学习、深度学习、数据可视化demo的python框架。它不需要你有web开发的经验,会写python就可以高效的开发你的demo。 + +### 第二步 页面导航信息以及布局配置 + +```python +st.set_page_config( + page_title="余元医疗问答", # 页面标签标题 + page_icon=":shark:", # 页面标签图标 + layout="wide", # 页面的布局 + initial_sidebar_state="expanded", # 左侧的sidebar的布局方式 + # 配置菜单按钮的信息 + menu_items={ + 'Get Help': 'https://www.extremelycoolapp.com/help', + 'Report a bug': "https://www.extremelycoolapp.com/bug", + 'About': "# This is a header. This is an *extremely* cool app!" 
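+        # These menu links are just placeholders; replace them with your own help/bug-report/about pages.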
+ } + ) +``` +这一步可以省略,如果想让app更加个性化,可以添加这些设置。 + +### 第三步 设置demo标题 +```python +st.title('Demo for MedicalQA') +``` +streamlit的每一个小组件对应于页面都有一个默认的样式展示。 + +### 第四步 配置demo的参数 + +```python +# 此处是用的sidebar,侧边栏作为参数配置模块 +st.sidebar.header("参数配置") +# 这里是在sidebar里面创建了表单,每个表单一定有一个标题和提交按钮 +sbform = st.sidebar.form("固定参数设置") +# slider是滑动条组建,可以配置数值型参数 +n_sample = sbform.slider("设置返回条数",min_value=1,max_value=10,value=3) +text_length = sbform.slider('生成长度:',min_value=32,max_value=512,value=64,step=32) +text_level = sbform.slider('文本多样性:',min_value=0.1,max_value=1.0,value=0.9,step=0.1) +# number_input也可以配置数值型参数 +model_id = sbform.number_input('选择模型号:',min_value=0,max_value=13,value=13,step=1) +# selectbox选择组建,只能选择配置的选项 +trans = sbform.selectbox('选择翻译内核',['百度通用','医疗生物']) +# 提交表单的配置,这些参数的赋值才生效 +sbform.form_submit_button("提交配置") + +# 这里是页面中的参数配置,也是demo的主体之一 +form = st.form("参数设置") +# 本demo是qa demo,所以要录入用户的文本输入,text_input组建可以实现 +input_text = form.text_input('请输入你的问题:',value='',placeholder='例如:糖尿病的症状有哪些?') +form.form_submit_button("提交") +``` +以上就把demo的参数基本配置完成了。 + +### 第五步 模型预测 +```python +# 定义一个前向预测的方法 +# @st.cache(suppress_st_warning=True) +def generate_qa(input_text,n_sample,model_id='7',length=64,translator='baidu',level=0.7): + # 这里我们是把模型用fastapi搭建了一个api服务 + URL = 'http://192.168.190.63:6605/qa' + data = { + "text":input_text,"n_sample":n_sample, + "model_id":model_id,"length":length, + 'translator':translator,'level':level + } + r = requests.get(URL,params=data) + return r.text +# 模型预测结果 +results = generate_qa(input_text,n_sample,model_id=str(model_id), + translator=translator,length=text_length,level=text_level) +``` +这里说明一下,由于demo展示机器没有GPU,所以模型部署采用的是Fastapi部署在后台的。如果demo展示的机器可以直接部署模型,这里可以直接把模型预测的方法写在这里,不需要另外部署模型,再用api的方式调用。这样做有一个值得注意的地方,因为streamlit的代码每一次运行,都是从头到尾执行一遍,就导致模型可能会重复加载,所以这里需要用到st.cache组建,当内容没有更新的时候,会把这一步的结果缓存,而不会重新执行。保证了效率不会因此而下降。 + +### 第六步 结果展示 +```python +with st.spinner('老夫正在思考中🤔...'): + if input_text: + results = generate_qa(input_text,n_sample,model_id=str(model_id), + translator=translator,length=text_length,level=text_level) + for idx,item in enumerate(eval(results),start=1): + st.markdown(f""" + **候选回答「{idx}」:**\n + """) + st.info('中文:%s'%item['fy_next_sentence']) + st.info('英文:%s'%item['next_sentence']) +``` +streamlit对不同格式的内容展示,有丰富的组建,对于文本可以用`st.markdown`组建以及`st.text`和`st.write`展示。更多组建和功能可以参考官方文档:https://docs.streamlit.io + +至此,一个完整的demo展示就完成了。效果图如下: + +![](./image/demo.png) + +完整的代码可以参考:`Fengshenbang-LM/fengshen/examples/FastDemo/YuyuanQA.py` diff --git a/fengshen/examples/FastDemo/YuyuanQA.py b/fengshen/examples/FastDemo/YuyuanQA.py new file mode 100644 index 0000000000000000000000000000000000000000..fed2d19bc61e0735f3868e1a30a532bd19fbb4b0 --- /dev/null +++ b/fengshen/examples/FastDemo/YuyuanQA.py @@ -0,0 +1,71 @@ +import requests +import langid +import streamlit as st +from translate import baiduTranslatorMedical +from translate import baiduTranslator + +langid.set_languages(['en', 'zh']) +lang_dic = {'zh': 'en', 'en': 'zh'} + +st.set_page_config( + page_title="余元医疗问答", + page_icon=":shark:", + # layout="wide", + initial_sidebar_state="expanded", + menu_items={ + 'Get Help': 'https://www.extremelycoolapp.com/help', + 'Report a bug': "https://www.extremelycoolapp.com/bug", + 'About': "# This is a header. This is an *extremely* cool app!" 
+ } +) +st.title('Demo for MedicalQA') + + +st.sidebar.header("参数配置") +sbform = st.sidebar.form("固定参数设置") +n_sample = sbform.slider("设置返回条数", min_value=1, max_value=10, value=3) +text_length = sbform.slider('生成长度:', min_value=32, max_value=512, value=64, step=32) +text_level = sbform.slider('文本多样性:', min_value=0.1, max_value=1.0, value=0.9, step=0.1) +model_id = sbform.number_input('选择模型号:', min_value=0, max_value=13, value=13, step=1) +trans = sbform.selectbox('选择翻译内核', ['百度通用', '医疗生物']) +sbform.form_submit_button("配置") + + +form = st.form("参数设置") +input_text = form.text_input('请输入你的问题:', value='', placeholder='例如:糖尿病的症状有哪些?') +if trans == '百度通用': + translator = 'baidu_common' +else: + translator = 'baidu' +if input_text: + lang = langid.classify(input_text)[0] + if translator == 'baidu': + st.write('**你的问题是:**', baiduTranslatorMedical(input_text, src=lang, dest=lang_dic[lang]).text) + else: + st.write('**你的问题是:**', baiduTranslator(input_text, src=lang, dest=lang_dic[lang]).text) + +form.form_submit_button("提交") + +# @st.cache(suppress_st_warning=True) + + +def generate_qa(input_text, n_sample, model_id='7', length=64, translator='baidu', level=0.7): + # st.write('调用了generate函数') + URL = 'http://192.168.190.63:6605/qa' + data = {"text": input_text, "n_sample": n_sample, "model_id": model_id, + "length": length, 'translator': translator, 'level': level} + r = requests.get(URL, params=data) + return r.text +# my_bar = st.progress(80) + + +with st.spinner('老夫正在思考中🤔...'): + if input_text: + results = generate_qa(input_text, n_sample, model_id=str(model_id), + translator=translator, length=text_length, level=text_level) + for idx, item in enumerate(eval(results), start=1): + st.markdown(f""" + **候选回答「{idx}」:**\n + """) + st.info('中文:%s' % item['fy_next_sentence']) + st.info('英文:%s' % item['next_sentence']) diff --git a/fengshen/examples/FastDemo/image/demo.png b/fengshen/examples/FastDemo/image/demo.png new file mode 100644 index 0000000000000000000000000000000000000000..3eee22e26192861429863058de716e457fc8fc57 Binary files /dev/null and b/fengshen/examples/FastDemo/image/demo.png differ diff --git a/fengshen/examples/GAVAE/generate.py b/fengshen/examples/GAVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6a5693faaf18932d8a8648bb999546619c4cdf --- /dev/null +++ b/fengshen/examples/GAVAE/generate.py @@ -0,0 +1,23 @@ +import torch +from transformers import BertTokenizer,T5Tokenizer +from fengshen.models.GAVAE.GAVAEModel import GAVAEModel +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +input_texts = [ + "非常好的一个博物馆,是我所有去过的博物馆里感觉最正规的一家,凭有效证件可以入馆,可以自助免费存小件物品,讲解员和馆内外的工作人员也非常认真,其他的服务人员也很热情,非常好的!馆内的藏品也让人非常震撼!希望继续保持~", + "这是我来长沙最最期待的一定要去的地方,总算今天特地去瞻仰千古遗容了,开车到门口大屏幕显示着门票已发完的字样,心里一惊以为今天是白来了。但进了停车场才知道凭停车卡和有效身份证里面也能领,停车还不花钱,真好。", + "地方很大 很气派~~可以逛很久~~~去的时候是免费的~不过要安检~~~里面的马王堆~幸追夫人~还是很不错的~~~~去的时候有一个吴越文化特别展~~~东西也很多~~~~~很好看", + "我们到达的时候是下午3点,门票已经发完了。当时正焦虑的不知道怎么办才好,门卫大哥给我们俩补办了门票,这才得以入馆。非常感谢!绝对不虚此行!相当震撼的展览!原来古人也化妆,还有假发。记忆最深的是那个藕汤。可惜真颜已不得见。", + "去过三次,个人认为这是长沙最值得去的地方,博物馆的重点就是辛追,遗憾的是,每次去我都会感到悲哀,虽然我三次去的时候都要门票,但是每次看到辛追,都觉得现代的人类不应该挖她出来,除了第一次我觉得辛追像刚死去一样,后来两次我觉得太惨不忍睹了。建议大家要去就早去,以后肯定越来越腐烂", + 
"上大学时候去的,当时学生证是半价25,后来凭有效证件就不要钱了。非常喜欢的一家博物馆,里面可看的东西很多,当然最吸引我的就是那个辛追夫人和“素纱单衣”,果然不是盖的~里面的讲解员大部分都是师大学历史类的,非常专业和有耐心。虽然不在长沙了,不过对那里还是很有感情的,赞~~~", + "这两年也有很多机会去博物馆。。。不过还是想说湖南省博物馆是非常有特色的。。。应该说整个展览分成两个部分吧。。。一个部分是马王堆的主体展。。。另一个就是湖南的一些考古发现。。。其实来省博大部分的游客还是冲着马王堆来的吧。。。博物馆也很有心的为每一批游客安排了讲解员。。。从马王堆的发现到马王堆出土文物的介绍再到最后棺木和辛追的介绍。。。真是上了一节很生动的历史课。", + "网上订票去的,还是很顺利的就进去了,里面挺清净的,外围的环境也不错,还有鸽子可以喂。那天不是很闹,兜了一圈感觉还是很顺畅的,老娘娘和金缕玉衣挺震撼的。到此一游还是挺需要的", +] +gavae_model = GAVAEModel.from_pretrained("IDEA-CCNL/Randeng-GAVAE-1.2B-Augmentation-Chinese").to(device) +gavae_model.train_gan(encoder_tokenizer,decoder_tokenizer,input_texts) +# n:输出样本数量 +texts = gavae_model.generate(n=5) +print(texts) diff --git a/fengshen/examples/PPVAE/generate.py b/fengshen/examples/PPVAE/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..1bbd369768cf1b903b4edf642836d28dc5a09274 --- /dev/null +++ b/fengshen/examples/PPVAE/generate.py @@ -0,0 +1,24 @@ +import torch +from transformers import BertTokenizer,T5Tokenizer +from fengshen.models.PPVAE.pluginVAE import PPVAEModel +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +encoder_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese") +decoder_tokenizer = T5Tokenizer.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese", eos_token = '<|endoftext|>', pad_token = '',extra_ids=0) +decoder_tokenizer.add_special_tokens({'bos_token':''}) +ppvae_model = PPVAEModel.from_pretrained("IDEA-CCNL/Randeng-PPVAE-1.2B-Augmentation-Chinese").to(device) +input_texts = [ + "非常好的一个博物馆,是我所有去过的博物馆里感觉最正规的一家,凭有效证件可以入馆,可以自助免费存小件物品,讲解员和馆内外的工作人员也非常认真,其他的服务人员也很热情,非常好的!馆内的藏品也让人非常震撼!希望继续保持~", + "这是我来长沙最最期待的一定要去的地方,总算今天特地去瞻仰千古遗容了,开车到门口大屏幕显示着门票已发完的字样,心里一惊以为今天是白来了。但进了停车场才知道凭停车卡和有效身份证里面也能领,停车还不花钱,真好。", + "地方很大 很气派~~可以逛很久~~~去的时候是免费的~不过要安检~~~里面的马王堆~幸追夫人~还是很不错的~~~~去的时候有一个吴越文化特别展~~~东西也很多~~~~~很好看", + "我们到达的时候是下午3点,门票已经发完了。当时正焦虑的不知道怎么办才好,门卫大哥给我们俩补办了门票,这才得以入馆。非常感谢!绝对不虚此行!相当震撼的展览!原来古人也化妆,还有假发。记忆最深的是那个藕汤。可惜真颜已不得见。", + "去过三次,个人认为这是长沙最值得去的地方,博物馆的重点就是辛追,遗憾的是,每次去我都会感到悲哀,虽然我三次去的时候都要门票,但是每次看到辛追,都觉得现代的人类不应该挖她出来,除了第一次我觉得辛追像刚死去一样,后来两次我觉得太惨不忍睹了。建议大家要去就早去,以后肯定越来越腐烂", + "上大学时候去的,当时学生证是半价25,后来凭有效证件就不要钱了。非常喜欢的一家博物馆,里面可看的东西很多,当然最吸引我的就是那个辛追夫人和“素纱单衣”,果然不是盖的~里面的讲解员大部分都是师大学历史类的,非常专业和有耐心。虽然不在长沙了,不过对那里还是很有感情的,赞~~~", + "这两年也有很多机会去博物馆。。。不过还是想说湖南省博物馆是非常有特色的。。。应该说整个展览分成两个部分吧。。。一个部分是马王堆的主体展。。。另一个就是湖南的一些考古发现。。。其实来省博大部分的游客还是冲着马王堆来的吧。。。博物馆也很有心的为每一批游客安排了讲解员。。。从马王堆的发现到马王堆出土文物的介绍再到最后棺木和辛追的介绍。。。真是上了一节很生动的历史课。", + "网上订票去的,还是很顺利的就进去了,里面挺清净的,外围的环境也不错,还有鸽子可以喂。那天不是很闹,兜了一圈感觉还是很顺畅的,老娘娘和金缕玉衣挺震撼的。到此一游还是挺需要的", +] + +ppvae_model.train_plugin(encoder_tokenizer,decoder_tokenizer,input_texts,negative_samples=None) +# n:输出样本数量 +texts = ppvae_model.generate(n=5) +print(texts) \ No newline at end of file diff --git a/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh b/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5ff555aa60e3cebd544b92a18443eb7505f8ae8 --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_erlangshen_offload.sh @@ -0,0 +1,103 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-MegatronBert-1.3B" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=1 +VAL_BATCH_SIZE=1 +ZERO_STAGE=3 +config_json="./ds_config.json" + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 1, + "zero_optimization": { + "stage": ${ZERO_STAGE}, + 
"offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9 + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE}_offload \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . \ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/demo_classification_afqmc_roberta.sh b/fengshen/examples/classification/demo_classification_afqmc_roberta.sh new file mode 100644 index 0000000000000000000000000000000000000000..bad55f2de72f66f02b583d9b191802c55cfe0a4b --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_roberta.sh @@ -0,0 +1,62 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-Roberta-110M-NLI" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=1 +VAL_BATCH_SIZE=1 + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ddp \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . 
\ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh b/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh new file mode 100644 index 0000000000000000000000000000000000000000..48b003940a960454912a62731e5aec3b9046a6df --- /dev/null +++ b/fengshen/examples/classification/demo_classification_afqmc_roberta_deepspeed.sh @@ -0,0 +1,90 @@ +MODEL_NAME="IDEA-CCNL/Erlangshen-Roberta-110M-NLI" + +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + +BATCH_SIZE=32 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +config_json="./ds_config.json" + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +DATA_ARGS="\ + --dataset_name IDEA-CCNL/AFQMC \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + --model_type huggingface-auto \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath . \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --precision 16 \ + --default_root_dir . \ + " + +options=" \ + --pretrained_model_path $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune_classification.py $options + diff --git a/fengshen/examples/classification/finetune_classification.py b/fengshen/examples/classification/finetune_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..2e643f2fcf560b6c817d22946ad4a6610b647e13 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification.py @@ -0,0 +1,389 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
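For local data, the finetune_classification.py entrypoint added below reads one JSON object per line from --train_data/--valid_data/--test_data, with the text and label fields named by --texta_name/--textb_name/--label_name/--id_name (sentence1/sentence2/label/id in the AFQMC scripts above). A minimal sketch of producing such a file — the sentences and labels here are made-up placeholders, not real AFQMC rows:

```python
import json

# Illustrative only: field names mirror the --texta_name/--textb_name/--label_name/--id_name
# values used by the AFQMC demo scripts; the content is invented for demonstration.
samples = [
    {"sentence1": "example sentence A", "sentence2": "example sentence B", "label": "1", "id": 0},
    {"sentence1": "another sentence A", "sentence2": "another sentence B", "label": "0", "id": 1},
]
with open("train.json", "w", encoding="utf-8") as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")
```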
+# from fengshen.models.zen1 import ZenModel +from dataclasses import dataclass +from fengshen.models.megatron_t5 import T5EncoderModel +from fengshen.models.roformer import RoFormerModel +from fengshen.models.longformer import LongformerModel +# from fengshen.models.cocolm.modeling_cocolm import COCOLMForSequenceClassification +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor +from torch.utils.data import Dataset, DataLoader +from torch.utils.data._utils.collate import default_collate +from transformers import ( + BertModel, + BertConfig, + MegatronBertModel, + MegatronBertConfig, + AutoModel, + AutoConfig, + AutoTokenizer, + AutoModelForSequenceClassification, +) +# os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +model_dict = {'huggingface-bert': BertModel, + 'fengshen-roformer': RoFormerModel, + 'huggingface-megatron_bert': MegatronBertModel, + 'fengshen-megatron_t5': T5EncoderModel, + 'fengshen-longformer': LongformerModel, + # 'fengshen-zen1': ZenModel, + 'huggingface-auto': AutoModelForSequenceClassification, + } + + +class TaskDataset(Dataset): + def __init__(self, data_path, args, label2id): + super().__init__() + self.args = args + self.label2id = label2id + self.max_length = args.max_length + self.data = self.load_data(data_path, args) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + text_id = int(data[args.id_name] + ) if args.id_name in data.keys() else 0 + texta = data[args.texta_name] if args.texta_name in data.keys( + ) else '' + textb = data[args.textb_name] if args.textb_name in data.keys( + ) else '' + labels = self.label2id[data[args.label_name] + ] if args.label_name in data.keys() else 0 + samples.append({args.texta_name: texta, args.textb_name: textb, + args.label_name: labels, 'id': text_id}) + return samples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + + def __call__(self, samples): + sample_list = [] + for item in samples: + if item[self.args.texta_name] != '' and item[self.args.textb_name] != '': + if self.args.model_type != 'fengshen-roformer': + encode_dict = self.tokenizer.encode_plus( + [item[self.args.texta_name], item[self.args.textb_name]], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + else: + encode_dict = self.tokenizer.encode_plus( + [item[self.args.texta_name] + + self.tokenizer.eos_token+item[self.args.textb_name]], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + else: + encode_dict = self.tokenizer.encode_plus( + item[self.args.texta_name], + max_length=self.args.max_length, + padding='max_length', + truncation='longest_first') + sample = {} + for k, v in encode_dict.items(): + sample[k] = torch.tensor(v) + sample['labels'] = torch.tensor(item[self.args.label_name]).long() + sample['id'] = item['id'] + sample_list.append(sample) + return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + 
parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_path) + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = self.tokenizer + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), args, self.label2id) + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), args, self.label2id) + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), args, self.label2id) + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class taskModel(torch.nn.Module): + def __init__(self, args): + super().__init__() + self.args = args + print('args mode type:', args.model_type) + self.bert_encoder = model_dict[args.model_type].from_pretrained( + args.pretrained_model_path) + self.config = self.bert_encoder.config + self.cls_layer = torch.nn.Linear( + in_features=self.config.hidden_size, out_features=self.args.num_labels) + self.loss_func = torch.nn.CrossEntropyLoss() + + def forward(self, input_ids, attention_mask, token_type_ids, labels=None): + if self.args.model_type == 'fengshen-megatron_t5': + bert_output = self.bert_encoder( + input_ids=input_ids, attention_mask=attention_mask) # (bsz, seq, dim) + encode = bert_output.last_hidden_state[:, 0, :] + else: + bert_output = self.bert_encoder( + input_ids=input_ids, 
attention_mask=attention_mask, token_type_ids=token_type_ids) # (bsz, seq, dim) + encode = bert_output[1] + logits = self.cls_layer(encode) + if labels is not None: + loss = self.loss_func(logits, labels.view(-1,)) + return loss, logits + else: + return 0, logits + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + self.model = model_dict[args.model_type].from_pretrained( + args.pretrained_model_path) + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + del batch['id'] + output = self.model(**batch) + loss, logits = output[0], output[1] + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + del batch['id'] + output = self.model(**batch) + loss, logits = output[0], output[1] + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc, sync_dist=True) + + def predict_step(self, batch, batch_idx): + ids = batch['id'] + del batch['id'] + output = self.model(**batch) + return {ids, output.logits} + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + every_n_epochs=1, + filename=args.filename) + + +def save_test(data, args, data_model, rank): + file_name = args.output_save_path + f'.{rank}' + with open(file_name, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + ids, batch = data[i] + for id, sample in 
zip(ids, batch): + tmp_result = dict() + label_id = np.argmax(sample.cpu().numpy()) + tmp_result['id'] = id.item() + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+file_name) + + +def main(): + pl.seed_everything(42) + + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + total_parser.add_argument('--model_type', + default='huggingface-bert', type=str) + + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + print(args.pretrained_model_path) + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + early_stop_callback = EarlyStopping( + monitor="val_acc", min_delta=0.00, patience=5, verbose=False, mode="max") + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[ + checkpoint_callback, + lr_monitor, + early_stop_callback] + ) + + data_model = TaskDataModel(args) + model = LitModel(args, len(data_model.train_dataloader())) + + trainer.fit(model, data_model) + result = trainer.predict( + model, data_model, ckpt_path=trainer.checkpoint_callback.best_model_path) + save_test(result, args, data_model, trainer.global_rank) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/classification/finetune_classification.sh b/fengshen/examples/classification/finetune_classification.sh new file mode 100644 index 0000000000000000000000000000000000000000..993071ceb0ceeb44c0bf887abcdbc0c9f982c4d5 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=2 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=16G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
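The save_test helper above writes one prediction shard per rank (args.output_save_path plus a ".{rank}" suffix), with one JSON object per line holding the sample id and the predicted label. A small sketch of merging those shards after a multi-GPU run, assuming the default predict.json prefix:

```python
import glob
import json

# Illustrative: gather the per-rank shards written by save_test (predict.json.0, predict.json.1, ...)
# and write a single file ordered by sample id.
predictions = []
for shard in sorted(glob.glob("predict.json.*")):
    with open(shard, encoding="utf-8") as f:
        predictions.extend(json.loads(line) for line in f)

with open("predict_merged.json", "w", encoding="utf-8") as f:
    for pred in sorted(predictions, key=lambda p: p["id"]):
        f.write(json.dumps(pred, ensure_ascii=False) + "\n")
```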
+ + + +MODEL_TYPE=fengshen-roformer +PRETRAINED_MODEL_PATH=IDEA-CCNL/Zhouwenwang-Unified-110M + +ROOT_PATH=cognitive_comp +TASK=tnews + +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/modelevaluation/tnews/ +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 128 \ + --max_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + " + +MODEL_ARGS="\ + --learning_rate 0.00002 \ + --weight_decay 0.1 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir ./log/ \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + --model_type $MODEL_TYPE \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +python3 $SCRIPT_PATH $options +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh b/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..2700d2ad3d6fca47238db033781905ac372b183a --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bart-base_afqmc.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +MODEL_NAME=bart-base + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$MODEL_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/ckpt/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK}/predict.json + + +config_json="./ds_config.${MODEL_NAME}.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-5, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 64 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-6 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh b/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh new file mode 100644 index 
0000000000000000000000000000000000000000..6ef4886993eb2c1c8938180c940ece9bb156b73f --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bart-base_ocnli.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=ocnli-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +MODEL_NAME=bart-base + +TASK=ocnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$MODEL_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/ckpt/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${MODEL_NAME}-${TASK}/predict.json + + +config_json="./ds_config.${MODEL_NAME}.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-6 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + 
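These scripts build their DeepSpeed config with a shell heredoc and hand it to PyTorch Lightning through the PL_DEEPSPEED_CONFIG_PATH environment variable, so a malformed heredoc only surfaces at launch time. A quick pre-flight check, assuming the variable is exported as in the script above:

```python
import json
import os

# Illustrative sanity check: confirm the heredoc-generated DeepSpeed config parses as JSON.
config_path = os.environ.get("PL_DEEPSPEED_CONFIG_PATH", "./ds_config.json")
with open(config_path) as f:
    ds_config = json.load(f)  # raises json.JSONDecodeError if the heredoc output is malformed
print("ZeRO stage:", ds_config["zero_optimization"]["stage"], "| fp16:", ds_config["fp16"]["enabled"])
```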
+DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d36b627d6cc1b0a8de575138eec6a7529b31137 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_afqmc.sh @@ -0,0 +1,146 @@ +#!/bin/bash +#SBATCH --job-name=afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=4 # total number of tasks across all nodes +#SBATCH --cpus-per-task=20 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:4 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + +set -x -e +echo "START TIME: $(date)" + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=2 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/gaoxinyu/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/cognitive_comp/gaoxinyu/ln_model/fintune/ckpt/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/gaoxinyu/ln_model/finetune/${BERT_NAME}-${TASK} +OUTPUT_PATH=/cognitive_comp/gaoxinyu/ln_model/finetune/${BERT_NAME}-${TASK}/predict.json + + +config_json="./ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 1000, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": 2 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-1 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + 
--every_n_train_steps 0 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 67 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --precision 16 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun -N 1 --job-name=afqmc --jobid=151522 --ntasks=4 --cpus-per-task=15 --gres=gpu:4 -o %x-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..da10752cff77be9462d17cbb45882543a5e0ed48 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_cmnli.sh @@ -0,0 +1,161 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
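As a worked example of the total-steps arithmetic in LitModel.setup() earlier in this diff: the preceding afqmc bert-3.9B script trains with a per-GPU batch size of 8 on 4 GPUs and no gradient accumulation, so one optimizer step consumes 32 samples. Using an illustrative training-set size (the real AFQMC count may differ):

```python
# Worked example of total_steps = (len(train) * max_epochs // (batch * world_size)) // accumulation.
# train_size is a stand-in value; batch size, GPU count, and max_epochs follow the script above.
train_size = 34_000              # illustrative, not the verified AFQMC size
max_epochs = 67
train_batchsize = 8
world_size = 4                   # --gpus 4
accumulate_grad_batches = 1

tb_size = train_batchsize * world_size                        # 32 samples per optimizer step
total_steps = (train_size * max_epochs // tb_size) // accumulate_grad_batches
print(total_steps)               # 71187
```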
+ + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=cmnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME/ +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-3 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-6 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.000001 \ + --weight_decay 0.001 \ + --warmup 0.001 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B 
/cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..13e08efc318a60eabec72cd4357f8aa9dd558f44 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_iflytek.sh @@ -0,0 +1,158 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=iflytek +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/Fengshenbang-LM/fengshen/scripts/log/$TASK +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 119 \ + " + 
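Here --num_labels is 119 because iflytek is a 119-class classification task; in the taskModel class defined in finetune_classification.py it simply sets the output width of the linear head stacked on the encoder. A stripped-down sketch of that head (the hidden size is a placeholder and depends on the backbone):

```python
import torch

# Illustrative classification head in the style of taskModel's cls_layer + CrossEntropyLoss.
hidden_size, num_labels = 768, 119          # 768 is a placeholder; 119 matches --num_labels above
cls_layer = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)
loss_func = torch.nn.CrossEntropyLoss()

pooled = torch.randn(4, hidden_size)        # stand-in for the encoder's pooled output
labels = torch.randint(0, num_labels, (4,))
logits = cls_layer(pooled)
loss = loss_func(logits, labels.view(-1))
print(logits.shape, loss.item())
```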
+MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..8d3107931f88671d54d50325b8d469a12ee4e224 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_ocnli.sh @@ -0,0 +1,163 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-1.3B + +TASK=ocnli +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-6 + } + }, + "scheduler": { + "type": "WarmupLR", + 
"params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-6, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.000001 \ + --weight_decay 0.001 \ + --warmup 0.001 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..62a2349bd4ce90d20f9747fd570cb070ea60be2f --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_tnews.sh @@ -0,0 +1,161 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=4 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:4 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=tnews +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/$ROOT_PATH/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/fengshen/fengshen/scripts/log/$TASK/$BERT_NAME/nograd +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-8, + "warmup_max_lr": 1e-5, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 4 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --gradient_clip_val 0.1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec 
--nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh b/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh new file mode 100644 index 0000000000000000000000000000000000000000..5d05662f1a2252de3bbd4fd9719ef8d3262d9c63 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_bert-3.9B_wsc.sh @@ -0,0 +1,158 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=16 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=8G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/yangping/cache/torch_extendsions + +BERT_NAME=bert-3.9B + +TASK=wsc +TEXTA_NAME=texta +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=16 +VAL_BATCH_SIZE=56 +ZERO_STAGE=2 + + +ROOT_PATH=cognitive_comp +DATA_DIR=/cognitive_comp/yangping/data/unidata/multichoice/mrc_multichoice_data/other/cluewsc2020/ +PRETRAINED_MODEL_PATH=/$ROOT_PATH/yangping/pretrained_model/$BERT_NAME/ + + +CHECKPOINT_PATH=/$ROOT_PATH/yangping/checkpoints/fengshen-finetune/$TASK/ +DEFAULT_ROOT_DIR=/cognitive_comp/yangping/nlp/Fengshenbang-LM/fengshen/scripts/log/$TASK +OUTPUT_PATH=/$ROOT_PATH/yangping/nlp/modelevaluation/output/${TASK}_predict.json + + +config_json="./ds_config.$SLURM_JOBID.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 6553600, + "stage3_prefetch_bucket_size": 5898240, + "stage3_param_persistence_threshold": 25600, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_fp16_weights_on_model_save": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 0.00001 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + --num_labels 2 \ + " + 
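The ZeRO-3 configs in these scripts rely on DeepSpeed's WarmupLR scheduler; the tnews config above, for instance, ramps from warmup_min_lr 5e-8 to warmup_max_lr 1e-5 over 400 steps with warmup_type "linear". A small worked example of that schedule, assuming the usual WarmupLR behaviour of a linear ramp followed by a constant learning rate:

```python
# Illustrative reading of the WarmupLR parameters used above (linear ramp, then flat).
warmup_min_lr, warmup_max_lr, warmup_num_steps = 5e-8, 1e-5, 400

def lr_at(step: int) -> float:
    if step >= warmup_num_steps:
        return warmup_max_lr
    return warmup_min_lr + (warmup_max_lr - warmup_min_lr) * step / warmup_num_steps

print(lr_at(0), lr_at(200), lr_at(400))     # 5e-08 5.025e-06 1e-05
```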
+MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 10 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 2 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 10 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +DOCKER_PATH=/$ROOT_PATH/yangping/containers/pytorch21_06_py3_docker_image.sif +SCRIPT_PATH=/$ROOT_PATH/yangping/nlp/fengshen/fengshen/examples/finetune_classification.py + +# python3 $SCRIPT_PATH $options +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $DOCKER_PATH python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh b/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..845e93093cc6390db2c332c22e860ff88688a657 --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_zen1-base_afqmc.sh @@ -0,0 +1,151 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=fengshen-zen1 + +TASK=afqmc +TEXTA_NAME=sentence1 +TEXTB_NAME=sentence2 +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + + +config_json="${ROOT_DIR}/ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-7, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 1e-5, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 64 \ + --texta_name $TEXTA_NAME \ + --textb_name $TEXTB_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +source activate base +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh b/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..eaa50ddac4376c8e86000852da138d0d4779126d --- /dev/null +++ b/fengshen/examples/classification/finetune_classification_zen1-base_tnews.sh @@ -0,0 +1,150 @@ +#!/bin/bash +#SBATCH --job-name=afqmc-bart-base # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # 
number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + +export CUDA_VISIBLE_DEVICES='5' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=fengshen-zen1 + +TASK=tnews +TEXTA_NAME=sentence +LABEL_NAME=label +ID_NAME=id + + +BATCH_SIZE=8 +VAL_BATCH_SIZE=32 +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + + +config_json="${ROOT_DIR}/ds_config.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +# reduce_bucket_size: hidden_size*hidden_size +# stage3_prefetch_bucket_size: 0.9 * hidden_size * hidden_size +# stage3_param_persistence_threshold: 10 * hidden_size + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $BATCH_SIZE, + "steps_per_print": 100, + "gradient_clipping": 0.1, + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 2e-5, + "eps": 1e-12, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 2e-8, + "warmup_max_lr": 2e-5, + "warmup_num_steps": 400, + "warmup_type": "linear" + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize $BATCH_SIZE \ + --valid_batchsize $VAL_BATCH_SIZE \ + --max_length 128 \ + --texta_name $TEXTA_NAME \ + --label_name $LABEL_NAME \ + --id_name $ID_NAME \ + " + +MODEL_ARGS="\ + --learning_rate 1e-5 \ + --weight_decay 1e-2 \ + --warmup 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy $STRATEGY \ + --gradient_clip_val 1.0 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 1.0 \ + --default_root_dir $ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_save_path $OUTPUT_PATH \ + --model_type $MODEL_NAME \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/classification/finetune_classification.py + +# python3 $SCRIPT_PATH $options +source activate base +singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# 
/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/classification/readme.md b/fengshen/examples/classification/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..b90ce5a946acf55a6530b3c8d010a5ec2642f6ae --- /dev/null +++ b/fengshen/examples/classification/readme.md @@ -0,0 +1,23 @@ +## 分类下游任务 + +在当前目录下,我们提供丰富的分类任务的示例,其中我们提供三个一键式运行的示例。 + +- demo_classification_afqmc_roberta.sh 使用DDP微调roberta +- demo_classification_afqmc_roberta_deepspeed.sh 结合deepspeed微调roberta,获得更快的运算速度 +- demo_classification_afqmc_erlangshen_offload.sh 仅需7G显存即可微调我们效果最好的二郎神系列模型 + +上述示例均采用AFQMC的数据集,关于数据集的介绍可以在[这里](https://www.cluebenchmarks.com/introduce.html)找到。 +同时我们处理过的数据文件已经放在Huggingface上,点击[这里](https://huggingface.co./datasets/IDEA-CCNL/AFQMC)直达源文件。 +仅需要按我们的格式稍微处理一下数据集,即可适配下游不同的分类任务。 +在脚本示例中,仅需要修改如下参数即可适配本地文件 +``` + --dataset_name IDEA-CCNL/AFQMC \ + +-------> 修改为 + + --data_dir $DATA_DIR \ # 数据目录 + --train_data train.json \ # 数据文件 + --valid_data dev.json \ + --test_data test.json \ + +``` \ No newline at end of file diff --git a/fengshen/examples/clip_finetune/clip_finetune_flickr.py b/fengshen/examples/clip_finetune/clip_finetune_flickr.py new file mode 100644 index 0000000000000000000000000000000000000000..9cac74d87e861cf0ffff64c9ca03330208db90c3 --- /dev/null +++ b/fengshen/examples/clip_finetune/clip_finetune_flickr.py @@ -0,0 +1,259 @@ +import sys +sys.path.append('../../') +from data.clip_dataloader.flickr import FlickrDataModule +import pytorch_lightning as pl +import numpy as np +import torch +from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts +import torch.nn.functional as F +import math +import copy +import argparse +from transformers import CLIPModel, BertForSequenceClassification + +class CLIPLightning(pl.LightningModule): + def __init__(self, model_name='ViT-B/32', minibatch_size=2): + """A lightning wrapper for a CLIP model as specified in the paper. + + Args: + model_name (str): A case sensitive visual model name. + config (dict): A dictionary containing the CLIP instantiation parameters. 
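+ minibatch_size (int): chunk size used to split each training batch when image/text features are recomputed for the contrastive loss.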
+ """ + super().__init__() + + self.prepare_data_per_node = True + self.model_name = 'ViT-B/32' + # self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") # NOTE load from openAI + self.text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-102M-Chinese") + self.minibatch_size = minibatch_size + self.isViT = 'ViT' in self.model_name + self.automatic_optimization = False + + # Training loss: https://github.com/openai/CLIP/issues/83 + # Mini-batching thanks to https://github.com/crowsonkb / https://twitter.com/RiversHaveWings + # Multi-GPU support: https://github.com/MicPie/clasp + + def training_step(self, train_batch, idx): + # get optimizers and scheduler + optimizer = self.optimizers() + + image, text, labels = train_batch + n = math.ceil(len(image) // self.minibatch_size) + image_mbs = torch.chunk(image, n) + text_mbs = torch.chunk(text, n) + + with torch.no_grad(): + ims = [F.normalize(self.clip_model.get_image_features(im), dim=1) for im in image_mbs] + txt = [F.normalize(self.text_encoder(t).logits, dim=1) for t in text_mbs] + # gather from all GPUs 这里的LOSS要把所有GPU的汇集起来一起算才对 + ims = self.all_gather(torch.cat(ims)) + txt = self.all_gather(torch.cat(txt)) + + if len(ims.shape) == 3: + ims = list(ims) + txt = list(txt) + else: + ims = [ims] + txt = [txt] + + image_logits = torch.cat(ims) @ torch.cat(txt).t() * self.clip_model.logit_scale.exp() + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + + F.cross_entropy(image_logits.t(), ground_truth)).div(2) + acc_i = (torch.argmax(image_logits, 1) == ground_truth).sum() + acc_t = (torch.argmax(image_logits, 0) == ground_truth).sum() + self.log_dict({'loss': loss / len(ims), 'acc': (acc_i + acc_t) / 2 / len(image) / len(ims)}, prog_bar=True) + + if isinstance(optimizer, list): + optimizer = optimizer[0] + optimizer.zero_grad() + + # image loss + for j, mb in enumerate(image_mbs[:-1]): + # 最后一部分样本舍弃。(对齐的bug) + images_tmp = copy.deepcopy(ims) + images_tmp[self.global_rank][j * self.minibatch_size:(j+1)*self.minibatch_size] = \ + F.normalize(self.clip_model.get_image_features(mb), dim=1) + image_logits = torch.cat(images_tmp) @ torch.cat(txt).t() * self.clip_model.logit_scale.exp() + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(image_logits.t(), ground_truth))/2 + self.manual_backward(loss) + + # text loss + for j, mb in enumerate(text_mbs[:-1]): + text_tmp = copy.deepcopy(txt) + text_tmp[self.global_rank][j * self.minibatch_size:(j+1)*self.minibatch_size] = \ + F.normalize(self.text_encoder(mb).logits, dim=1) + image_logits = torch.cat(ims) @ torch.cat(text_tmp).t() * self.clip_model.logit_scale.exp() + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(image_logits.t(), ground_truth))/2 + self.manual_backward(loss) + + optimizer.step() + lr_scheduler = self.lr_schedulers() + lr_scheduler.step() + self.clip_model.logit_scale.data.clamp_(-np.log(100), np.log(100)) + + def validation_step(self, val_batch, idx): + image, text, labels = val_batch + img_embed = self.clip_model.get_image_features(image) + txt_embed = self.text_encoder(text).logits + # print(img_embed.shape) + image_norm = F.normalize(img_embed, dim=1) + text_norm = F.normalize(txt_embed, dim=1) + image_logits = image_norm @ text_norm.t() * 
self.clip_model.logit_scale.exp() + text_logits = text_norm @ image_norm.t() * self.clip_model.logit_scale.exp() + # print(image_logits.shape) + # image_logits, text_logits = self.forward(image, text) + ground_truth = torch.arange(len(image_logits)).long().to(image_logits.device) + loss = (F.cross_entropy(image_logits, ground_truth) + F.cross_entropy(text_logits, ground_truth)).div(2) + self.log('val_loss', loss, prog_bar=True) + return [image_norm, text_norm, labels] + + def validation_epoch_end(self, outputs): + image_features = torch.cat([x[0] for x in outputs]) + text_features = torch.cat([x[1] for x in outputs]) + labels = [label for x in outputs for label in x[2]] + print(image_features.shape, text_features.shape, len(labels)) + self.get_metrics(image_features, text_features, labels, 100) + + def test_step(self, test_batch, idx): + image, text, labels = test_batch + image_features = self.clip_model.get_image_features(image) + text_features = self.text_encoder(text).logits + image_features = image_features / image_features.norm(dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + return [image_features, text_features, labels] + + def test_epoch_end(self, outputs): + image_features = torch.cat([x[0] for x in outputs]) + text_features = torch.cat([x[1] for x in outputs]) + labels = [label for x in outputs for label in x[2]] + print(image_features.shape, text_features.shape, len(labels)) + self.get_metrics(image_features, text_features, labels, 100) + + def get_metrics(self, image_features, text_features, labels, logit_scale): + # 计算相似度,支持多个样本的情况(比如一个图片有多个caption) + # img2txt计算的时候要用到,因为一张图片可能对应多个文本。 + # txt2img计算的时候不需要(一般一个text只有一个对应图片) + # metrics = {} + logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu() + logits_per_text = logits_per_image.t().detach().cpu() + + logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text} + + label2idx = {} # 计算label到idx的映射。 + repeat_id = [] + for i, label in enumerate(labels): + if label not in label2idx: + label2idx[label] = [i] + else: + # 表示该index的标签出现过,记录这个index,后续算txt2img分数的时候,这些index的权值要降低。 + label2idx[label].append(i) + repeat_id.append(i) + # print(label2idx) # 标注了每个label的idx + + # print('repeat_id:', repeat_id) + ground_truth = [label2idx[label] for label in labels] + # print(ground_truth) + + for name, logit in logits.items(): + # print(name, logit.shape) + if name == 'text_to_image': + logit[:, repeat_id] -= 1e8 # 这部分的分数要降低。(重复出现的图片,直接忽略) + r1_stat, r5_stat, r10_stat = [], [], [] + ranking = torch.argsort(logit, descending=True) # index of the largest element to the smallest + # print(name, ranking[:, :10]) + for i, each_query in enumerate(ranking[:, :10]): + for j, q in enumerate(each_query): + if q in ground_truth[i]: + if j == 0: + r1_stat.append(1) + r5_stat.append(1) + r10_stat.append(1) + break + if j < 5: + r5_stat.append(1) + r10_stat.append(1) + break + if j < 10: + r10_stat.append(1) + break + print(f'{name} r1:{sum(r1_stat)/len(logit)}, r5:{sum(r5_stat)/len(logit)}, r10:{sum(r10_stat)/len(logit)}') + + def configure_optimizers(self): + lr = { + "RN50": 5e-4, + "RN101": 5e-4, + "RN50x4": 5e-4, + "RN50x16": 4e-4, + "RN50x64": 3.6e-4, + "ViT-B/32": 5e-4, + "ViT-B/16": 5e-4, + "ViT-L/14": 4e-4, + "ViT-L/14-336px": 2e-5 + }[self.model_name] + + optimizer = torch.optim.AdamW( + [{'params': self.clip_model.parameters()}, {'params': self.text_encoder.parameters()}], + lr=lr, + betas=( + 0.9, + 0.98 if self.isViT else 0.999 + ), + eps=1e-6 if 
self.isViT else 1e-8, + weight_decay=0.2 + ) + + # Source: https://github.com/openai/CLIP/issues/107 + # Use pip install 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup' + lr_scheduler = CosineAnnealingWarmRestarts( + optimizer, + T_0=2000 + ) + # CosineAnnealingWarmupRestarts + return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler} + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # model_name + parser.add_argument('--model', type=str, + default="ViT-B/32", + help='model definition') + + # experiment setting + parser.add_argument('--batch_size', type=int, default=128) + parser.add_argument('--num_epoches', type=int, default=1) + parser.add_argument('--num_gpus', type=int, default=2) + + # dataset + parser.add_argument('--train_filename', type=str, + help='dir or csv file') + parser.add_argument('--train_root', type=str, + help='image root path') + parser.add_argument('--val_filename', type=str, + help='dir or csv file') + parser.add_argument('--val_root', type=str, + help='image root path') + parser.add_argument('--test_filename', type=str, + help='dir or csv file') + parser.add_argument('--test_root', type=str, + help='image root path') + parser.add_argument('--num_workers', type=int, default=0) + + # huggingface pretrain model 定义 + parser.add_argument('--pretrain_model', type=str, + default="openai/clip-vit-base-patch32", + help='defalut load from openai') # "wf-genius/TaiYi-CLIP-ViT-B-32" 是我训好的 NOTE + + args = parser.parse_args() + dm = FlickrDataModule(args) + + model = CLIPLightning(model_name=args.model, minibatch_size=args.batch_size//2) + trainer = pl.Trainer(gpus=args.num_gpus, precision=16, max_epochs=args.num_epoches) + trainer.test(model, dm) # zero-shot test + trainer.fit(model, dm) # finetune on train set + trainer.test(model, dm) # test again + diff --git a/fengshen/examples/clip_finetune/finetune_flickr.sh b/fengshen/examples/clip_finetune/finetune_flickr.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e8f8c79decdbd4a070188fbfa976bd4b90d0d8d --- /dev/null +++ b/fengshen/examples/clip_finetune/finetune_flickr.sh @@ -0,0 +1,10 @@ +python clip_finetune_flickr.py --batch_size 512 \ +--num_gpus 1 \ +--num_workers 20 \ +--train_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/train/flickr30k_cna_train.txt \ +--val_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/val/flickr30k_cna_val.txt \ +--test_filename /shared_space/ccnl/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt \ +--train_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ +--val_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ +--test_root /shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images \ + diff --git a/fengshen/examples/clue1.1/README.md b/fengshen/examples/clue1.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..63856c5a596db8f968a7dcebcc03d85ff8c3a49f --- /dev/null +++ b/fengshen/examples/clue1.1/README.md @@ -0,0 +1,48 @@ +# 中文 NLP 权威测评基准 CLUE 刷榜 Top10 方案指南 + + [CLUE](https://www.cluebenchmarks.com) 是中文 NLP 的权威测评榜单,也吸引了许多国内许多团队在上面进行测评。在我们的最新模型 UniMC 中,也使用 CLUE 对我们的模型进行了测评。在全量数据榜单 CLUE1.1 中,我们的 [UniMC-DeBERTa-1.4B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-DeBERTa-v2-1.4B-Chinese) 模型取得了第 8 的成绩,是 [CLUE1.1](https://www.cluebenchmarks.com/rank.html) 排行榜(2022年11月14日)前 10 名中唯一开源模型权重和刷榜代码的模型。 + +## 刷榜方案 + +通过观察可以发现,在CLUE需要测评的 9 个任务中,有 8 个是分类任务,只有一个 cmrc2018 是抽取式的阅读理解任务。因此,结合我们的 Fengshenbang-LM 已有的模型,我们可以使用 
[UniMC](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/dev/yangping/fengshen/examples/unimc) 来实现 8 个是分类任务,用 [Ubert](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/dev/yangping/fengshen/examples/ubert) 来实现 cmrc2018 任务,详细的方案可以看我们的知乎文章:https://zhuanlan.zhihu.com/p/583679722 + +## 项目要求 + +安装我们的 fengshen 框架,我们暂且提供如下方式安装 +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable ./ +``` +## 运行项目 + +### 数据下载 +由于 HuggingFace 上的数据与最终提交的数据 id 有可能对应不上,所以建议还是去官方仓库进行下载 +https://github.com/CLUEBENCHMARK/CLUE + + +### 数据预处理 +将数据下载之后,修改下面脚本的路径,运行下面脚本将数据处理成 UniMC 模型 和 Ubert 模型所需要的格式 +```shell +sh cluedata2unidata.sh +``` + +### 模型训练 +训练CLUE上的8个分类任务,一些训练参数可根据自己的设备进行修改。对于全量数据来说,训练超参数没有那么大的影响 +```shell +sh run_clue_unimc.sh +``` +训练 cmrc2018 任务,一些训练参数可根据自己的设备进行修改 +```shell +sh run_clue_ubert.sh +``` + +### 预测结果提交 + +运行下面脚本将预测结果转化为 CLUE 要求的格式,数据路径需要根据自己的路径修改调整。运行下面脚本就可以得到结果,然后拿到 [CLUE](https://www.cluebenchmarks.com/index.html) 官网上去提交了 + +```shell +sh predict2submit.sh +``` + + diff --git a/fengshen/examples/clue1.1/cluedata2unidata.sh b/fengshen/examples/clue1.1/cluedata2unidata.sh new file mode 100644 index 0000000000000000000000000000000000000000..d838604a8b3d39ab90b277f2a467d6d087e7bd54 --- /dev/null +++ b/fengshen/examples/clue1.1/cluedata2unidata.sh @@ -0,0 +1,15 @@ + +CLUEDATA_PATH=./CLUE_DATA #CLUE 原始数据路径 +UNIDATA_PATH=./data #处理数据输出路 + +SCRIPT_PATH=./data_preprocessing + +python $SCRIPT_PATH/afqmc_preprocessing.py --data_path=$CLUEDATA_PATH/afqmc_public --save_path=$UNIDATA_PATH/afqmc +python $SCRIPT_PATH/c3_preprocessing.py --data_path=$CLUEDATA_PATH/c3_public --save_path=$UNIDATA_PATH/c3 +python $SCRIPT_PATH/chid_preprocessing.py --data_path=$CLUEDATA_PATH/chid_public --save_path=$UNIDATA_PATH/chid +python $SCRIPT_PATH/csl_preprocessing.py --data_path=$CLUEDATA_PATH/csl_public --save_path=$UNIDATA_PATH/csl +python $SCRIPT_PATH/iflytek_preprocessing.py --data_path=$CLUEDATA_PATH/iflytek_public --save_path=$UNIDATA_PATH/iflytek +python $SCRIPT_PATH/ocnli_preprocessing.py --data_path=$CLUEDATA_PATH/ocnli_public --save_path=$UNIDATA_PATH/ocnli +python $SCRIPT_PATH/tnews_preprocessing.py --data_path=$CLUEDATA_PATH/tnews_public --save_path=$UNIDATA_PATH/tnews +python $SCRIPT_PATH/wsc_preprocessing.py --data_path=$CLUEDATA_PATH/cluewsc2020_public --save_path=$UNIDATA_PATH/wsc +python $SCRIPT_PATH/cmrc2018_preprocessing.py --data_path=$CLUEDATA_PATH/cmrc2018_public --save_path=$UNIDATA_PATH/cmrc2018 diff --git a/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..9297199bc6f0e0972ec508876680a321ee8a4165 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/afqmc_preprocessing.py @@ -0,0 +1,59 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={"0": "不相似", "1": "相似"} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence1'] + textb = data['sentence2'] + question = '' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({ + 'task_type':'语义匹配', + 'texta':texta, + 'textb':textb, + 'question':question, + 
'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__=="__main__": + + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) diff --git a/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..46f8c268cf829384cc05d2b4c3c01e826d1ad892 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/c3_preprocessing.py @@ -0,0 +1,72 @@ +import json +from tqdm import tqdm +import os +import argparse + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = json.loads(''.join(f.readlines())) + result=[] + for line in tqdm(lines): + data = line + texta = '\n'.join(data[0]) + textb ='' + for qa in data[1]: + question=qa['question'] + choice=qa['choice'] + answer=qa['answer'] if 'answer' in qa.keys() else '' + label = qa['choice'].index(answer) if 'answer' in qa.keys() else 0 + text_id = qa['id'] if 'id' in qa.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list=['d-train','d-dev','c3-m-train','m-train','m-dev','test1.0','test1.1'] + train_data = [] + dev_data = [] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + data=load_data(file_path=file_path) + if 'train' in file or 'd-dev' in file: + train_data.extend(data) + elif 'm-dev' in file: + dev_data.extend(data) + elif 'test' in file: + output_path = os.path.join(save_path,file+'.json') + save_data(data,output_path) + + output_path = os.path.join(save_path,'train.json') + save_data(train_data,output_path) + + output_path = os.path.join(save_path,'dev.json') + save_data(dev_data,output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..e55aaf9b1c4ceed02343c5417aa205e570fef26c --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/chid_preprocessing.py @@ -0,0 +1,159 @@ +import json +from tqdm import tqdm +import os +import re +import argparse + 
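+# ChID idiom-cloze preprocessing: for every `#idiom\d{6}#` placeholder in a passage,
+# the surrounding placeholders are filled in with their gold idioms (looked up in the
+# train/dev answer files), one candidate sentence is built per idiom in `candidates`,
+# and overly long contexts are trimmed so that text plus choices stay within 512 characters.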
+mask_token='[MASK]' +label_mask='__' + + +def load_schema(train_answer,dev_answer): + with open(train_answer,'r',encoding='utf-8') as f: + train2id = json.loads(''.join(f.readlines())) + + with open(dev_answer,'r',encoding='utf-8') as f: + dev2id = json.loads(''.join(f.readlines())) + for k,v in dev2id.items(): + train2id[k]=v + + return train2id + + +def cut(sentence): + """ + 将一段文本切分成多个句子 + :param sentence: ['虽然BillRoper正忙于全新游戏 + :return: ['虽然BillRoper正..接近。' , '与父母,之首。' , '很多..常见。' , '”一位上..推进。' , ''”一直坚..市场。'' , '如今,...的70%。'] + """ + new_sentence = [] + sen = [] + for i in sentence: # 虽 + sen.append(i) + if i in ['。', '!', '?', '?',',',',']: + new_sentence.append("".join(sen)) #['虽然BillRoper正...接近。' , '与父母,...之首。' , ] + sen = [] + + if len(new_sentence) <= 1: # 一句话超过max_seq_length且没有句号的,用","分割,再长的不考虑了。 + new_sentence = [] + sen = [] + for i in sentence: + sen.append(i) + if i.split(' ')[0] in [',', ','] and len(sen) != 0: + new_sentence.append("".join(sen)) + sen = [] + + if len(sen) > 0: # 若最后一句话无结尾标点,则加入这句话 + new_sentence.append("".join(sen)) + return new_sentence + + +def get_answer_text(text,m): + sent_list=cut(text) + text1='' + text2='' + for i,sent in enumerate(sent_list): + if m in sent: + text1=''.join(sent_list[:i]) + if i+1>len(sent_list)-1: + text2='' + else: + text2=''.join(sent_list[i+1:]) + index_text=sent + return text1,text2,index_text + return '','','' + + + +def load_data(file_path,label2id): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for l,line in tqdm(enumerate(lines)): + data = json.loads(line) + choice=data['candidates'] + for s,sent in enumerate(data['content']): + masks=re.findall("#idiom\d{6}#", sent) + for m in masks: + text1,text2,index_text=get_answer_text(sent,m) + + masks1=re.findall("#idiom\d{6}#", text1) + for m1 in masks1: + text1=text1.replace(m1,choice[label2id[m1]]) + + masks2=re.findall("#idiom\d{6}#", text2) + for m2 in masks2: + text2=text2.replace(m2,choice[label2id[m2]]) + + masks3=re.findall("#idiom\d{6}#", index_text) + for m3 in masks3: + if m3!=m: + index_text=index_text.replace(m3,choice[label2id[m3]]) + + choice=[] + for c in data['candidates']: + choice.append(index_text.replace(m,c)) + + if len('.'.join(choice))>400: + choice=data['candidates'] + text1=text1+index_text.split(m)[0] + text2=index_text.split(m)[1]+text2 + + if len(text1)+len(text2)>512-len('.'.join(choice)): + split1=0 + split2=0 + while split1+split2<512-len('.'.join(choice)): + if split1= len(context): + return results + n += 1 + + +def load_data(file_path,is_training=False): + task_type='抽取任务' + subtask_type='抽取式阅读理解' + with open(file_path, 'r', encoding='utf8') as f: + lines = json.loads(''.join(f.readlines())) + result=[] + lines = lines['data'] + for line in tqdm(lines): + if line['paragraphs']==[]: + continue + data = line['paragraphs'][0] + context=data['context'].strip() + for qa in data['qas']: + question=qa['question'].strip() + rcv=[] + for a in qa['answers']: + if a not in rcv: + rcv.append(a) + split=stride_split(question, context, a['text'], a['answer_start']) + for sp in split: + choices = [] + + choice = {} + choice['id']=qa['id'] + choice['entity_type'] = qa['question'] + choice['label']=0 + entity_list=[] + if sp[3]>=0 and sp[4]>=0: + entity_list.append({'entity_name':sp[2],'entity_type':'','entity_idx':[[sp[3],sp[4]]]}) + + choice['entity_list']=entity_list + choices.append(choice) + + if choices==[]: + print(data) + continue + result.append({ 'task_type':task_type, + 'subtask_type':subtask_type, + 
'text':sp[1], + 'choices':choices, + 'id':0}) + + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list=['dev','train','trial','test'] + train_data = [] + dev_data = [] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + data=load_data(file_path=file_path) + if 'train' in file or 'trial' in file: + train_data.extend(data) + else: + output_path = os.path.join(save_path,file+'.json') + save_data(data,output_path) + + output_path = os.path.join(save_path,'train.json') + save_data(train_data,output_path) + \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..2762c4a82cc32fcd353d93f12a241bc900ef4624 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/csl_preprocessing.py @@ -0,0 +1,88 @@ +import json +from tqdm import tqdm +import os +import jieba.analyse +import argparse + + +label2desc={'1':'可以','0':'不能'} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['abst'] + abst = data['abst'] + textb = '' + keyword = '、'.join(data['keyword']) + question = '' + + + keyword=data['keyword'] + rs=jieba.analyse.extract_tags(data['abst'],topK=15) + texta='、'.join(rs)+'。'+texta + comm=[] + for k in keyword: + if k in rs: + comm.append(k) + + for word in comm: + if word in abst: + abst=abst.replace(word,word+'(共现关键字)') + + comm=[word for word in comm] + keyword=[word for word in data['keyword']] + + comm_text='共现词汇'+str(len(comm))+'个,分别是'+'、'.join(comm) + + keyword = '、'.join(keyword) + question='' + + + choice = [f'{v}使用{keyword}概括摘要' for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + answer = f'{answer}使用{keyword}概括摘要' + + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + for i in range(5): + print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) 
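All of the `*_preprocessing.py` scripts in this directory emit the same line-oriented JSON layout that the UniMC pipeline consumes. As a quick reference, the sketch below shows one such record; the field values are invented placeholders, only the keys and their roles are taken from the scripts above.

```python
# Minimal sketch of a converted record (one JSON object per output line).
# The concrete values below are placeholders for illustration only.
import json

record = {
    "task_type": "语义匹配",      # written by afqmc/ocnli, omitted by some scripts
    "texta": "示例句子一",         # first sentence or passage
    "textb": "示例句子二",         # empty string for single-sentence tasks
    "question": "",               # e.g. tnews uses "下面新闻属于哪一个类别?"
    "choice": ["不相似", "相似"],  # natural-language label descriptions
    "answer": "相似",             # empty for unlabeled test splits
    "label": 1,                   # index of `answer` within `choice` (0 on test data)
    "id": 0,
}

print(json.dumps(record, ensure_ascii=False))
```

Keeping the labels as short natural-language descriptions rather than bare class ids matches the multiple-choice formulation that `UniMCPipelines` expects, where each description is scored as an answer candidate.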
diff --git a/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..6a8f5ec44851697ac1a36f299a0a132dcf486b71 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/iflytek_preprocessing.py @@ -0,0 +1,188 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={ + '银行': '银行', + '社区服务': '社区', + '电商': '电商', + '支付': '支付', + '经营养成': '养成', + '卡牌': '卡牌', + '借贷': '借贷', + '驾校': '驾校', + '理财': '理财', + '职考': '职考', + '新闻': '新闻', + '旅游资讯': '旅游', + '公共交通': '交通', + '魔幻': '魔幻', + '医疗服务': '医疗', + '影像剪辑': '影像', + '动作类': '动作', + '工具': '工具', + '体育竞技': '体育', + '小说': '小说', + '运动健身': '运动', + '相机': '相机', + '辅助工具': '辅助', + '快递物流': '快递', + '高等教育': '教育', + '股票': '股票', + '菜谱': '菜谱', + '行车辅助': '行车', + '仙侠': '仙侠', + '亲子儿童': '亲子', + '购物咨询': '购物', + '射击游戏': '射击', + '漫画': '漫画', + '中小学': '小学', + '同城服务': '同城', + '成人教育': '成人', + '求职': '求职', + '电子产品': '电子', + '艺术': '艺术', + '薅羊毛': '赚钱', + '约会社交': '约会', + '经营': '经营', + '兼职': '兼职', + '短视频': '短视', + '音乐': '音乐', + '英语': '英语', + '棋牌中心': '棋牌', + '摄影修图': '摄影', + '养生保健': '养生', + '办公': '办公', + '政务': '政务', + '视频': '视频', + '论坛圈子': '论坛', + '彩票': '彩票', + '直播': '直播', + '其他': '其他', + '休闲益智': '休闲', + '策略': '策略', + '即时通讯': '通讯', + '汽车交易': '买车', + '违章': '违章', + '地图导航': '地图', + '民航': '民航', + '电台': '电台', + '语言(非英语)': '语言', + '搞笑': '搞笑', + '婚恋社交': '婚恋', + '社区超市': '超市', + '日常养车': '养车', + '杂志': '杂志', + '视频教育': '在线', + '家政': '家政', + '影视娱乐': '影视', + '装修家居': '装修', + '体育咨讯': '资讯', + '社交工具': '社交', + '餐饮店': '餐饮', + '美颜': '美颜', + '问诊挂号': '挂号', + '飞行空战': '飞行', + '综合预定': '预定', + '电影票务': '票务', + '笔记': '笔记', + '买房': '买房', + '外卖': '外卖', + '母婴': '母婴', + '打车': '打车', + '情侣社交': '情侣', + '日程管理': '日程', + '租车': '租车', + '微博博客': '博客', + '百科': '百科', + '绘画': '绘画', + '铁路': '铁路', + '生活社交': '生活', + '租房': '租房', + '酒店': '酒店', + '保险': '保险', + '问答交流': '问答', + '收款': '收款', + 'MOBA': '竞技', + 'K歌': '唱歌', + '技术': '技术', + '减肥瘦身': '减肥', + '工作社交': '工作', + '团购': '团购', + '记账': '记账', + '女性': '女性', + '公务员': '公务', + '二手': '二手', + '美妆美业': '美妆', + '汽车咨询': '汽车', + '行程管理': '行程', + '免费WIFI': '免费', + '教辅': '教辅', + '成人': '两性', + '出国': '出国', + '婚庆': '婚庆', + '民宿短租': '民宿'} + +choice = [k for k,v in label2desc.items()] +print('1'.join(choice)) +print(len('1'.join(choice))) + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence'] + textb = '' + question = '请问app应用属于?' 
+ + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label_des']] if 'label_des' in data.keys() else '' + + # choice = [k for k,v in label2desc.items()] + # answer = data['label_des'] if 'label_des' in data.keys() else '' + + label = choice.index(answer) if 'label_des' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + # for i in range(5): + # print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..344a8ea7b7049b9f4373ad4c36dc284c395b0034 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/ocnli_preprocessing.py @@ -0,0 +1,60 @@ +import json +from tqdm import tqdm +import os +import argparse + + +label2desc={'contradiction':'矛盾','neutral':'自然','entailment':'蕴含'} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence1'] + textb = data['sentence2'] + question = '' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label']] if 'label' in data.keys() else '' + label = choice.index(answer) if 'label' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'task_type':'自然语言推理', + 'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + for i in range(5): + print(result[i]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) diff --git a/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py new file mode 100644 index 
0000000000000000000000000000000000000000..9f187fac71b411d77273a1a45544eb9c35151bc9 --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/tnews_preprocessing.py @@ -0,0 +1,71 @@ +import json +from tqdm import tqdm +import argparse + +label2desc={"news_story": "故事", + "news_culture": "文化", + "news_entertainment": "娱乐", + "news_sports": "体育", + "news_finance": "财经", + "news_house": "房产", + "news_car": "汽车", + "news_edu": "教育", + "news_tech": "科技", + "news_military": "军事", + "news_travel": "旅游", + "news_world": "国际", + "news_stock": "股票", + "news_agriculture": "农业", + "news_game": "电竞"} + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + texta = data['sentence'] + textb = '' + question = '下面新闻属于哪一个类别?' + choice = [v for k,v in label2desc.items()] + answer = label2desc[data['label_desc']] if 'label_desc' in data.keys() else '' + label = choice.index(answer) if 'label_desc' in data.keys() else 0 + text_id = data['id'] if 'id' in data.keys() else 0 + result.append({'texta':texta, + 'textb':textb, + 'question':question, + 'choice':choice, + 'answer':answer, + 'label':label, + 'id':text_id}) + print(result[0]) + return result + + +def save_data(data,file_path): + with open(file_path, 'w', encoding='utf8') as f: + for line in data: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + +import os + +if __name__=="__main__": + parser = argparse.ArgumentParser(description="train") + parser.add_argument("--data_path", type=str,default="") + parser.add_argument("--save_path", type=str,default="") + + args = parser.parse_args() + + + data_path = args.data_path + save_path = args.save_path + + if not os.path.exists(save_path): + os.makedirs(save_path) + + file_list = ['train','dev','test1.0','test1.1'] + for file in file_list: + file_path = os.path.join(data_path,file+'.json') + output_path = os.path.join(save_path,file+'.json') + save_data(load_data(file_path),output_path) \ No newline at end of file diff --git a/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py b/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b5ec0a7625ac870e71e77cf8af6256e0a1609c --- /dev/null +++ b/fengshen/examples/clue1.1/data_preprocessing/wsc_preprocessing.py @@ -0,0 +1,81 @@ +import json +from tqdm import tqdm +import os +import argparse + +label2desc={'true':'是','false':'不是'} + + +def load_data(file_path,is_training=False): + with open(file_path, 'r', encoding='utf8') as f: + lines = f.readlines() + result=[] + for line in tqdm(lines): + data = json.loads(line) + target = data['target'] + text=list(data['text']) + if target['span2_index']1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=3G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ +#SBATCH --requeue +#SBATCH --qos=preemptive + + +DATA_DIR=./data/cmrc2018 #数据集路径 + +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-Ubert-110M-Chinese + +CHECKPOINT_PATH=./checkpoints + +LOAD_CHECKPOINT_PATH=./checkpoints/last.ckpt + +OUTPUT_PATH=./predict/cmrc2018_predict.json + +DEFAULT_ROOT_DIR=./log + + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data dev.json \ + --batchsize 32 \ + --max_length 314 \ + " + +MODEL_ARGS="\ + --learning_rate 0.00002 \ + --weight_decay 0.1 \ + --warmup 0.01 \ + --num_labels 1 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_span_acc \ + --save_top_k 5 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only true \ + --checkpoint_path $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_span_acc:.4f} \ + " + +#--load_checkpoints_path $LOAD_CHECKPOINT_PATH \ +TRAINER_ARGS="\ + --max_epochs 11 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --gradient_clip_val 0.25 \ + --val_check_interval 0.05 \ + --limit_val_batches 100 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_path $OUTPUT_PATH \ + --threshold 0.001 \ + --train \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SCRIPT_PATH=./solution/clue_ubert.py +python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/clue1.1/run_clue_unimc.sh b/fengshen/examples/clue1.1/run_clue_unimc.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d1f576211ddab72bb12a7ef63a7c026754b611b --- /dev/null +++ b/fengshen/examples/clue1.1/run_clue_unimc.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#SBATCH --job-name=slurm-test # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --mem-per-cpu=4G # memory per cpu-core (4G is default) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. 
+ +#SBATCH --requeue +#SBATCH --qos=preemptive + +TASK=tnews #clue 上的任务 ,可选afqmc、tnews、iflytek、wsc、ocnli、csl、chid、c3 +DATA_ROOT_PATH=./data #数据集路径 +DATA_DIR=$DATA_ROOT_PATH/$TASK + +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese #预训练模型的路径 + +CHECKPOINT_PATH=./checkpoint #训练模型保存的路径 + +LOAD_CHECKPOINT_PATH=./checkpoints/last.ckpt #加载预训练好的模型 + +OUTPUT_PATH=./predict/${TASK}_predict.json + +DEFAULT_ROOT_DIR=./log # 模型日志输出路径 + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --batchsize 1 \ + --max_length 512 \ + " + +# 如果使用的是 UniMC-DeBERTa-1.4B模型,学习率要设置1e-6 + +MODEL_ARGS="\ + --learning_rate 0.000002 \ + --weight_decay 0.1 \ + --warmup 0.06 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_ckpt_path $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 17 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --gradient_clip_val 0.25 \ + --default_root_dir $DEFAULT_ROOT_DIR \ + " + +#--load_checkpoints_path $LOAD_CHECKPOINT_PATH \ 如果想加载预训练好的ckpt模型,可以使用这个参数加载 + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --output_path $OUTPUT_PATH \ + --train \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SCRIPT_PATH=./solution/clue_unimc.py +python3 $SCRIPT_PATH $options + diff --git a/fengshen/examples/clue1.1/solution/clue_ubert.py b/fengshen/examples/clue1.1/solution/clue_ubert.py new file mode 100644 index 0000000000000000000000000000000000000000..97b3ed7b5a4eb9dff9dda0d9131ae206d55d1c2f --- /dev/null +++ b/fengshen/examples/clue1.1/solution/clue_ubert.py @@ -0,0 +1,46 @@ +import argparse +from fengshen import UbertPipelines +import os +import json +from tqdm import tqdm + +def load_data(data_path): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [json.loads(line) for line in tqdm(lines)] + return samples + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--data_dir', default='./data', type=str) + total_parser.add_argument('--train_data', default='train.json', type=str) + total_parser.add_argument('--valid_data', default='dev.json', type=str) + total_parser.add_argument('--test_data', default='test.json', type=str) + total_parser.add_argument('--output_path',default='./predict.json', type=str) + + total_parser = UbertPipelines.pipelines_args(total_parser) + args = total_parser.parse_args() + + train_data = load_data(os.path.join(args.data_dir, args.train_data)) + dev_data = load_data(os.path.join(args.data_dir, args.valid_data)) + test_data = load_data(os.path.join(args.data_dir, args.test_data)) + + # test_data = test_data[:10] + + model = UbertPipelines(args) + if args.train: + model.fit(train_data, dev_data) + + result = model.predict(test_data) + for line in result[:20]: + print(line) + + with open(args.output_path, 'w', encoding='utf8') as f: + for line in result: + json_data = json.dumps(line, ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/clue1.1/solution/clue_unimc.py b/fengshen/examples/clue1.1/solution/clue_unimc.py new file mode 100644 index 0000000000000000000000000000000000000000..a5ffe4899e31216326260a65d9d12ad7892fc60f --- /dev/null +++ b/fengshen/examples/clue1.1/solution/clue_unimc.py @@ -0,0 +1,63 @@ +import argparse +from 
fengshen.pipelines.multiplechoice import UniMCPipelines +import os +import json +import copy +from tqdm import tqdm + +def load_data(data_path): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + samples = [json.loads(line) for line in tqdm(lines)] + return samples + + +def comp_acc(pred_data,test_data): + corr=0 + for i in range(len(pred_data)): + if pred_data[i]['label']==test_data[i]['label']: + corr+=1 + return corr/len(pred_data) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--data_dir', default='./data', type=str) + total_parser.add_argument('--train_data', default='train.json', type=str) + total_parser.add_argument('--valid_data', default='dev.json', type=str) + total_parser.add_argument('--test_data', default='test.json', type=str) + total_parser.add_argument('--output_path', default='', type=str) + + total_parser = UniMCPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + train_data = load_data(os.path.join(args.data_dir, args.train_data)) + dev_data = load_data(os.path.join(args.data_dir, args.valid_data)) + test_data = load_data(os.path.join(args.data_dir, args.test_data)) + + # dev_data = dev_data[:200] + dev_data_ori=copy.deepcopy(dev_data) + + model = UniMCPipelines(args, args.pretrained_model_path) + + print(args.data_dir) + + if args.train: + model.train(train_data, dev_data) + result = model.predict(dev_data) + for line in result[:20]: + print(line) + + acc=comp_acc(result,dev_data_ori) + print('acc:',acc) + + if args.output_path != '': + test_result = model.predict(test_data) + with open(args.output_path, 'w', encoding='utf8') as f: + for line in test_result: + json_data=json.dumps(line,ensure_ascii=False) + f.write(json_data+'\n') + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/clue_sim/README.md b/fengshen/examples/clue_sim/README.md new file mode 100644 index 0000000000000000000000000000000000000000..41b5b72129491139fa6f21e7cc2ea07d027a60c3 --- /dev/null +++ b/fengshen/examples/clue_sim/README.md @@ -0,0 +1,90 @@ +# 二郎神打CLUE语义匹配榜 + - [比赛介绍](#比赛介绍) + - [clue语义匹配榜打榜思路](#clue语义匹配榜-打榜思路) + - [数据集介绍](#数据集介绍) + - [环境](#环境) + - [用法](#用法) + - [提交](#提交) + +## 比赛介绍 +- clue的语义匹配榜 (https://www.cluebenchmarks.com/sim.html) +- clue sim官方实例 (https://github.com/CLUEbenchmark/QBQTC) + +## clue语义匹配榜 打榜思路 + +- 直接使用fengshenbang的二郎神模型,就打到了前三。 +- 为了解决标签平衡问题,设计了一个交叉熵平滑滤波loss,就达到了第一。 + +详细的思路讲解在知乎: 链接 + +## 数据集介绍 + +QQ浏览器搜索相关性数据集(QBQTC,QQ Browser Query Title Corpus),是QQ浏览器搜索引擎目前针对大搜场景构建的一个融合了相关性、权威性、内容质量、 +时效性等维度标注的学习排序(LTR)数据集,广泛应用在搜索引擎业务场景中。 + +相关性的含义:0,相关程度差;1,有一定相关性;2,非常相关。数字越大相关性越高。 + +**数据量统计** + +| 训练集(train) | 验证集(dev) | 公开测试集(test_public) | 私有测试集(test) | +| :----: | :----: | :----: | :----: | +| 180,000| 20,000| 5,000 | >=10,0000| + +**评测指标** + +f1_score来自于sklearn.metrics,计算公式如下: +`F1 = 2 * (precision * recall) / (precision + recall)` + +## 环境 +* Python >= 3.6 +* torch == 1.8.0+cu111 +* transforms == 4.6.0 +* pytorch-lightning == 1.3.2 +* 一张GPU: A100 40G + +## 用法 + +fengshenbang的二郎神模型的使用是非常简单的。 + +该example下的代码和思想继承自fengshen/examples/classification/finetune_classification.py + +如果需要直接使用该python脚本,把官方的数据集处理成如下形式: + +```json +{"sentence1": "应届生实习", "sentence2": "实习生招聘-应届生求职网", "label": "1", "id": 0} +``` + +然后修改其中的fengshen/examples/classification/finetune_classification.sh的参数即可。 + +下面介绍该example的用法: + +### 创建文件夹 + +- dataset 文件夹,下载官方数据集后放进来就行 +- weights 文件夹,用以存放二郎神模型 +- submissions 文件夹,用以存放需要评测的json文件 + +### Train +```bash +python main.py \ + --mode 'Train' \ + 
--model_path './weights/Erlangshen-MegatronBert-1.3B-Similarity' \ + --model_name 'IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity' +``` + +加载最优的模型用以Test set的预测。 + +### Test +```bash +python main.py \ + --mode 'Test' \ + --predict_model_path 'your_model_path' \ + --model_path './weights/Erlangshen-MegatronBert-1.3B-Similarity' \ + --model_name 'IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity' +``` + +## 提交 + +在路径 ./submissions 下,找到 qbqtc_predict.json 并且提交到测评系统 + +注意:名字必须为qbqtc_predict.json diff --git a/fengshen/examples/clue_sim/__init__.py b/fengshen/examples/clue_sim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/clue_sim/finetune_clue_sim.py b/fengshen/examples/clue_sim/finetune_clue_sim.py new file mode 100644 index 0000000000000000000000000000000000000000..b05f6ea6ce67c35cd39dedd924df0b663fd5a8b2 --- /dev/null +++ b/fengshen/examples/clue_sim/finetune_clue_sim.py @@ -0,0 +1,325 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +from sklearn import metrics +import torch +import torch.nn as nn +from torch.utils.data import Dataset, DataLoader, ConcatDataset +import pytorch_lightning as pl +from collections import defaultdict +from transformers import AutoConfig, AutoModel, get_cosine_schedule_with_warmup +from loss import FocalLoss, LabelSmoothingCorrectionCrossEntropy + + +class CustomDataset(Dataset): + def __init__(self, file, tokenizer, max_len, mode='no_test'): + self.tokenizer = tokenizer + self.max_len = max_len + self.mode = mode + + self.ex_list = [] + with open('./dataset/' + file, "r", encoding='utf-8') as f: + for line in f: + sample = json.loads(line) + query = sample["query"] + title = sample["title"] + id = int(sample["id"]) + if self.mode == 'no_test': + relevant = int(sample["label"]) + self.ex_list.append((query, title, relevant, id)) + else: + self.ex_list.append((query, title, id)) + + def __len__(self): + return len(self.ex_list) + + def __getitem__(self, index): + if self.mode == 'no_test': + query, title, relevant, id = self.ex_list[index] + else: + query, title, id = self.ex_list[index] + + inputs = self.tokenizer.encode_plus( + query, title, + truncation=True, + add_special_tokens=True, + max_length=self.max_len, + padding='max_length', + return_token_type_ids=True + ) + ids = inputs['input_ids'] + mask = inputs['attention_mask'] + token_type_ids = inputs["token_type_ids"] + if self.mode == 'no_test': + return { + 'ids': torch.tensor(ids, dtype=torch.long), + 'mask': torch.tensor(mask, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'targets': torch.tensor(relevant, dtype=torch.float), + 'id': torch.tensor(id, dtype=torch.long) + } + else: + return { + 'ids': torch.tensor(ids, dtype=torch.long), + 'mask': torch.tensor(mask, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'id': 
torch.tensor(id, dtype=torch.long) + } + + +class CustomDataModule(pl.LightningDataModule): + def __init__(self, args, tokenizer): + super().__init__() + self.args = args + self.tokenizer = tokenizer + self.max_len = self.args.max_seq_length + self.train_dataset = None + self.val_dataset = None + + def setup(self, stage): + data_path = "./dataset" + assert os.path.exists(os.path.join(data_path, 'train.json')) + assert os.path.exists(os.path.join(data_path, 'dev.json')) + assert os.path.exists(os.path.join(data_path, 'test_public.json')) + if stage == 'fit': + self.train_dataset = CustomDataset('train.json', self.tokenizer, self.max_len) + self.val_dataset = CustomDataset('dev.json', self.tokenizer, self.max_len) + self.test_dataset = CustomDataset('test_public.json', self.tokenizer, self.max_len) + elif stage == 'test': + self.test_dataset = CustomDataset('test_public.json', self.tokenizer, self.max_len) + + def train_dataloader(self): + full_dataset = ConcatDataset([self.train_dataset, self.val_dataset]) + train_dataloader = DataLoader( + full_dataset, + batch_size=self.args.batch_size, + num_workers=4, + shuffle=True, + pin_memory=True, + drop_last=True) + return train_dataloader + + def val_dataloader(self): + val_dataloader = DataLoader( + self.test_dataset, + batch_size=self.args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False) + return val_dataloader + + def test_dataloader(self): + test_dataloader = DataLoader( + self.test_dataset, + batch_size=self.args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False) + return test_dataloader + + +class CustomModel(pl.LightningModule): + def __init__(self, args): + super().__init__() + self.args = args + self.model = self.args.model_name + self.cache_dir = self.args.model_path + self.scheduler = self.args.scheduler + self.step_scheduler_after = "batch" + self.optimizer = self.args.optimizer + self.pooler = self.args.use_original_pooler + self.category = self.args.cate_performance + self.loss_func = self.args.loss_function + + hidden_dropout_prob: float = 0.1 + layer_norm_eps: float = 1e-7 + + config = AutoConfig.from_pretrained(self.model, cache_dir=self.cache_dir) + + config.update( + { + "output_hidden_states": False, + "hidden_dropout_prob": hidden_dropout_prob, + "layer_norm_eps": layer_norm_eps, + } + ) + self.transformer = AutoModel.from_pretrained(self.model, config=config, cache_dir=self.cache_dir) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.linear = torch.nn.Linear(config.hidden_size, self.args.num_labels, bias=True) # 分三类 + + def configure_optimizers(self): + """Prepare optimizer and schedule""" + model = self.transformer + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": 0.01, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + + optimizer_index = ['Adam', 'AdamW'].index(self.optimizer) + optimizer = [ + torch.optim.Adam(optimizer_grouped_parameters, lr=self.args.learning_rate), + torch.optim.AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate)][optimizer_index] + + scheduler_index = ['StepLR', 'CosineWarmup', 'CosineAnnealingLR'].index(self.scheduler) + scheduler = [ + torch.optim.lr_scheduler.StepLR(optimizer, step_size=self.args.warmup_step, + gamma=self.args.warmup_proportion), + 
get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=int(self.args.warmup_proportion * self.total_steps), + num_training_steps=self.total_steps, + ), + torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5, eta_min=2e-06)][scheduler_index] + + scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1} + return [optimizer], [scheduler] + + def setup(self, stage=None): + if stage != "fit": + return + # calculate total steps + train_dataloader = self.trainer.datamodule.train_dataloader() + gpus = 0 if self.trainer.gpus is None else self.trainer.gpus + tb_size = self.args.batch_size * max(1, gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_dataloader.dataset) // tb_size) // ab_size + + def loss(self, outputs, targets): + lossf_index = ['CE', 'Focal', 'LSCE_correction'].index(self.loss_func) + loss_fct = [nn.CrossEntropyLoss(), FocalLoss(), LabelSmoothingCorrectionCrossEntropy()][lossf_index] + loss = loss_fct(outputs, targets) + return loss + + def category_performance_measure(self, labels_right, labels_pred, num_label=3): + text_labels = [i for i in range(num_label)] + + TP = dict.fromkeys(text_labels, 0) # 预测正确的各个类的数目 + TP_FP = dict.fromkeys(text_labels, 0) # 测试数据集中各个类的数目 + TP_FN = dict.fromkeys(text_labels, 0) # 预测结果中各个类的数目 + + label_dict = defaultdict(list) + for num in range(num_label): + label_dict[num].append(str(num)) + + # 计算TP等数量 + for i in range(0, len(labels_right)): + TP_FP[labels_right[i]] += 1 + TP_FN[labels_pred[i]] += 1 + if labels_right[i] == labels_pred[i]: + TP[labels_right[i]] += 1 + + # 计算准确率P,召回率R,F1值 + results = [] + for key in TP_FP: + P = float(TP[key]) / float(TP_FP[key] + 1e-9) + R = float(TP[key]) / float(TP_FN[key] + 1e-9) + F1 = P * R * 2 / (P + R) if (P + R) != 0 else 0 + # results.append("%s:\t P:%f\t R:%f\t F1:%f" % (key, P, R, F1)) + results.append(F1) + return results + + def monitor_metrics(self, outputs, targets): + pred = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + targets = targets.int().cpu().numpy().tolist() + if self.category: + category_results = self.category_performance_measure( + labels_right=targets, + labels_pred=pred, + num_label=self.args.num_labels + ) + return {"f1": category_results} + else: + f1_score = metrics.f1_score(targets, pred, average="macro") + return {"f1": f1_score} + + def forward(self, ids, mask, token_type_ids, labels): + transformer_out = self.transformer(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids) + + if self.pooler: + pooler_output = transformer_out.pooler_output + else: + sequence_output = transformer_out.last_hidden_state + pooler_output = torch.mean(sequence_output, dim=1) + logits = self.linear(self.dropout(pooler_output)) + + labels_hat = torch.argmax(logits, dim=1) + correct_count = torch.sum(labels == labels_hat) + return logits, correct_count + + def predict(self, ids, mask, token_type_ids): + transformer_out = self.transformer(input_ids=ids, attention_mask=mask, token_type_ids=token_type_ids) + pooler_output = transformer_out.pooler_output + logits = self.linear(self.dropout(pooler_output)) + logits = torch.argmax(logits, dim=1) + return logits + + def training_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + 
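+ # `f1` is a list of per-class F1 scores when cate_performance is enabled, otherwise a single macro-averaged F1.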
self.log("train_loss", loss, logger=True, prog_bar=True) + self.log('train_acc', correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("train_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("train_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("train_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("train_f1", f1, logger=True, prog_bar=True) + return loss + + def validation_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + self.log("val_loss", loss, logger=True, prog_bar=True) + self.log("val_acc", correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("val_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("val_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("val_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("val_f1", f1, logger=True, prog_bar=True) + + def test_step(self, batch, batch_idx): + ids, mask, token_type_ids, labels = batch['ids'], batch['mask'], batch['token_type_ids'], batch['targets'] + logits, correct_count = self.forward(ids, mask, token_type_ids, labels) + loss = self.loss(logits, labels.long()) + f1 = self.monitor_metrics(logits, labels)["f1"] + self.log("test_loss", loss, logger=True, prog_bar=True) + self.log("test_acc", correct_count.float() / len(labels), logger=True, prog_bar=True) + if self.category: + self.log("test_f1_key0", f1[0], logger=True, prog_bar=True) + self.log("test_f1_key1", f1[1], logger=True, prog_bar=True) + self.log("test_f1_key2", f1[2], logger=True, prog_bar=True) + else: + self.log("test_f1", f1, logger=True, prog_bar=True) + return {"test_loss": loss, "logits": logits, "labels": labels} + + def predict_step(self, batch, batch_idx, dataloader_idx): + ids, mask, token_type_ids, id = batch['ids'], batch['mask'], batch['token_type_ids'], batch['id'] + logits = self.predict(ids, mask, token_type_ids) + return {'id': id.cpu().numpy().tolist(), 'logits': logits.cpu().numpy().tolist()} diff --git a/fengshen/examples/clue_sim/loss.py b/fengshen/examples/clue_sim/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..537e2347f65aa952b0eb852c23a39901b0fef52e --- /dev/null +++ b/fengshen/examples/clue_sim/loss.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import torch +from torch.nn import functional as F + + +class FocalLoss(torch.nn.Module): + """Multi-class Focal loss implementation""" + + def __init__(self, gamma=2, weight=None, ignore_index=-100): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.weight = weight + self.ignore_index = ignore_index + + def forward(self, input, target): + """ + input: [N, C] + target: [N, ] + """ + logpt = F.log_softmax(input, dim=1) + pt = torch.exp(logpt) + logpt = (1-pt)**self.gamma * logpt + loss = F.nll_loss(logpt, target, self.weight, ignore_index=self.ignore_index) + return loss + +# 交叉熵平滑滤波 防止过拟合 + + +class LabelSmoothingCorrectionCrossEntropy(torch.nn.Module): + def __init__(self, eps=0.1, reduction='mean', ignore_index=-100): + super(LabelSmoothingCorrectionCrossEntropy, self).__init__() + self.eps = eps + self.reduction = reduction + self.ignore_index = ignore_index + + def forward(self, output, target): + c = output.size()[-1] + log_preds = F.log_softmax(output, dim=-1) + if self.reduction == 'sum': + loss = -log_preds.sum() + else: + loss = -log_preds.sum(dim=-1) + if self.reduction == 'mean': + loss = loss.mean() + + # task specific + labels_hat = torch.argmax(output, dim=1) + lt_sum = labels_hat + target + abs_lt_sub = abs(labels_hat - target) + correction_loss = 0 + for i in range(c): + if lt_sum[i] == 0: + pass + elif lt_sum[i] == 1: + if abs_lt_sub[i] == 1: + pass + else: + correction_loss -= self.eps*(0.5945275813408382) + else: + correction_loss += self.eps*(1/0.32447699714575207) + correction_loss /= c + # print(correction_loss) + return loss*self.eps/c + (1-self.eps) * \ + F.nll_loss(log_preds, target, reduction=self.reduction, ignore_index=self.ignore_index) + correction_loss diff --git a/fengshen/examples/clue_sim/main.py b/fengshen/examples/clue_sim/main.py new file mode 100644 index 0000000000000000000000000000000000000000..91c5a732d8cb1a683aa34a3b3f7c158861cd4492 --- /dev/null +++ b/fengshen/examples/clue_sim/main.py @@ -0,0 +1,133 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
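+# Entry point for the CLUE QBQTC sentence-similarity example: parses hyper-parameters,
+# builds the data module and Lightning model defined in train_func.py, and either
+# trains the model or writes test-set predictions to a jsonlines submission file.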
+import jsonlines +import torch +import pytorch_lightning as pl +from transformers import AutoTokenizer, BertTokenizer +from train_func import CustomDataset, CustomDataModule, CustomModel +import argparse +import os +import gpustat + +if __name__ == '__main__': + my_parser = argparse.ArgumentParser() + my_parser.add_argument( + "--model_path", default="./weights/Erlangshen-MegatronBert-1.3B-Similarity", type=str, required=False) + my_parser.add_argument( + "--model_name", default="IDEA-CCNL/Erlangshen-MegatronBert-1.3B-Similarity", type=str, required=False) + my_parser.add_argument("--max_seq_length", default=64, type=int, required=False) + my_parser.add_argument("--batch_size", default=32, type=int, required=False) + my_parser.add_argument("--val_batch_size", default=64, type=int, required=False) + my_parser.add_argument("--num_epochs", default=10, type=int, required=False) + my_parser.add_argument("--learning_rate", default=4e-5, type=float, required=False) + my_parser.add_argument("--warmup_proportion", default=0.2, type=int, required=False) + my_parser.add_argument("--warmup_step", default=2, type=int, required=False) + my_parser.add_argument("--num_labels", default=3, type=int, required=False) + my_parser.add_argument("--cate_performance", default=False, type=bool, required=False) + my_parser.add_argument("--use_original_pooler", default=True, type=bool, required=False) + my_parser.add_argument("--model_output_path", default='./pl_model', type=str, required=False) + my_parser.add_argument("--mode", type=str, choices=['Train', 'Test'], required=True) + my_parser.add_argument("--predict_model_path", default='./pl_model/', type=str, required=False) + my_parser.add_argument("--test_output_path", default='./submissions', type=str, required=False) + my_parser.add_argument("--optimizer", default='AdamW', type=str, required=False) # ['Adam', 'AdamW'] + # ['StepLR', 'CosineWarmup', 'CosineAnnealingLR'] + my_parser.add_argument("--scheduler", default='CosineWarmup', type=str, required=False) + my_parser.add_argument("--loss_function", default='LSCE_correction', type=str, + required=False) # ['CE', 'Focal', 'LSCE_correction'] + + args = my_parser.parse_args() + + print(args) + gpustat.print_gpustat() + + if 'Erlangshen' in args.model_name: + tokenizer = BertTokenizer.from_pretrained(args.model_name, cache_dir=args.model_path) + else: + tokenizer = AutoTokenizer.from_pretrained(args.model_name, cache_dir=args.model_path) + + seed = 1919 + pl.seed_everything(seed) + + dm = CustomDataModule( + args=args, + tokenizer=tokenizer, + ) + + metric_index = 2 + checkpoint = pl.callbacks.ModelCheckpoint( + save_top_k=1, + verbose=True, + monitor=['val_loss', 'val_acc', 'val_f1'][metric_index], + mode=['min', 'max', 'max'][metric_index] + ) + + lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval="step") + callbacks = [checkpoint, lr_monitor] + + logger = pl.loggers.TensorBoardLogger(save_dir=os.getcwd(), + name='lightning_logs/' + args.model_name.split('/')[-1]), + + trainer = pl.Trainer( + progress_bar_refresh_rate=50, + logger=logger, + gpus=-1 if torch.cuda.is_available() else None, + amp_backend='native', + amp_level='O2', + precision=16, + callbacks=callbacks, + gradient_clip_val=1.0, + max_epochs=args.num_epochs, + # accelerator='ddp', + # plugins='ddp_sharded', + ) + + if args.mode == 'Train': + print('Only Train') + model = CustomModel( + args=args, + ) + trainer.fit(model, dm) + + # Predict test, save results to json + if args.mode == 'Test': + print('Only Test') + test_loader = 
torch.utils.data.DataLoader( + CustomDataset('test.json', tokenizer, args.max_seq_length, 'test'), + batch_size=args.val_batch_size, + num_workers=4, + shuffle=False, + pin_memory=True, + drop_last=False + ) + + model = CustomModel(args=args).load_from_checkpoint(args.predict_model_path, args=args) + + predict_results = trainer.predict(model, test_loader, return_predictions=True) + + path = os.path.join( + args.test_output_path, + args.model_name.split('/')[-1].replace('-', '_')) + file_path = os.path.join(path, 'qbqtc_predict.json') + + if not os.path.exists(path): + os.makedirs(path) + if os.path.exists(file_path): + print('Json文件已存在, 将用本次结果替换') + + with jsonlines.open(file_path, 'w') as jsonf: + for predict_res in predict_results: + for i, p in zip(predict_res['id'], predict_res['logits']): + jsonf.write({"id": i, "label": str(p)}) + print('Json saved:', file_path) diff --git a/fengshen/examples/deepVAE/pretrain_deep_vae.py b/fengshen/examples/deepVAE/pretrain_deep_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..37884261d487b6d43e1c682f15b7fde6e3beb709 --- /dev/null +++ b/fengshen/examples/deepVAE/pretrain_deep_vae.py @@ -0,0 +1,141 @@ +import torch +import os +import random +import math +import argparse +from fengshen.data.fs_datasets.fs_datamodule import FSDataModule +from fengshen.example.deepVAE.vae_pl_module import DeepVAEModule + +from pytorch_lightning import ( + Trainer, + loggers, +) + +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from torch.nn.utils.rnn import pad_sequence + + +class NER_RE_Collator: + def __init__(self, bos_token, eos_token, sep_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + self.sep_token = sep_token + + def __call__(self, samples, max_len=128): + # when len(samples) is larger than one, we need to save the sentence length info + inputs_tensors, entity_tensors = [], [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! + input_entities, input_ids = sp['decoder_entities'], sp['decoder_target'] + input_entities = input_entities[:max_len] + [self.sep_token] + # shorten input_ids, based on the fact that sentence must be longer than the entities + input_ids = [self.bos_token] + input_ids[:max_len] + [self.eos_token] + entity_tensors.append(torch.tensor(input_entities, dtype=torch.long)) + inputs_tensors.append(torch.tensor(input_ids, dtype=torch.long)) + if not inputs_tensors or not entity_tensors: + return None # if all the examples in the batch exceed max_length sentence + inputs_tensors = pad_sequence(inputs_tensors, batch_first=True, padding_value=0) + entity_tensors = pad_sequence(entity_tensors, batch_first=True, padding_value=0) + return inputs_tensors, entity_tensors + + +class TDVAECollator: + def __init__(self, bos_token, eos_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + + def __call__(self, samples, max_len=120): + # when len(samples) is larger than one, we need to save the sentence length info + inputs = [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! 
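+            # pick one sentence that fits within max_len at random, slice it out of the
+            # flattened paragraph token ids, and wrap it with the BOS/EOS special tokens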
+ sent_lengths, input_ids = sp['decoder_sent_lengths'], sp['decoder_target'] + potential_indices = [idx for idx, slen in enumerate(sent_lengths) if slen < max_len] + if len(potential_indices) == 0: + continue # we ignore paragraphs with only one sentence split + selected_idx = random.choice(potential_indices) + start_pos, end_pos = sum(sent_lengths[:selected_idx]), sum(sent_lengths[:selected_idx])+sent_lengths[selected_idx] + selected_input_ids = [self.bos_token] + input_ids[start_pos:end_pos] + [self.eos_token] + inputs.append(torch.tensor(selected_input_ids, dtype=torch.long)) + if not inputs: + return None # if all the examples in the batch exceed max_length sentence + inputs = pad_sequence(inputs, batch_first=True, padding_value=0) + return inputs + + +class ZH_Fin_Collator: + def __init__(self, bos_token, eos_token) -> None: + self.bos_token = bos_token + self.eos_token = eos_token + + def __call__(self, samples, max_len=120): + inputs = [] + for sp in samples: + # NOTE: in TD-VAE, both encoder and decoder are gpt2, thus use decoder sent twice ! + input_ids = sp['input_ids'] + if len(input_ids) == 0: + continue # we ignore paragraphs with empty string + selected_input_ids = [self.bos_token] + input_ids + [self.eos_token] + inputs.append(torch.tensor(selected_input_ids, dtype=torch.long)) + if not inputs: + return None + inputs = pad_sequence(inputs, batch_first=True, padding_value=0) + return inputs + + +class VAEModelCheckpoint: + @ staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='total_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument('--filename', default='model-{epoch:2d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=-1, type=int) + parser.add_argument('--every_n_train_steps', default=1000, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + @staticmethod + def get_callback(args): + return ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + + args_parser = FSDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = DeepVAEModule.add_module_specific_args(args_parser) + args_parser = VAEModelCheckpoint.add_argparse_args(args_parser) + + args = args_parser.parse_args() + # TODO: update this to be tokenizer specific + # collator = NER_RE_Collator(bos_token=21128, eos_token=21129, sep_token=102) + # collator = TDVAECollator(bos_token=21128, eos_token=21129) + collator = ZH_Fin_Collator(bos_token=21128, eos_token=21129) + + data_module = FSDataModule(args=args, collate_fn=collator) + + train_steps = math.ceil(len(data_module.train_dataset)*args.max_epochs / + args.train_batchsize / args.num_nodes / args.gpus) + model = DeepVAEModule(args, train_steps) + + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), name='deepvae_lightning') + + save_cpt_callback = VAEModelCheckpoint.get_callback(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = Trainer.from_argparse_args(args, + callbacks=[save_cpt_callback, lr_monitor], + 
logger=logger) + trainer.fit(model, data_module) diff --git a/fengshen/examples/deepVAE/pretrain_deep_vae.sh b/fengshen/examples/deepVAE/pretrain_deep_vae.sh new file mode 100644 index 0000000000000000000000000000000000000000..29967a73689777dd2240bd5916c843f62913b5e3 --- /dev/null +++ b/fengshen/examples/deepVAE/pretrain_deep_vae.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +#SBATCH --job-name=deep_vae_pretrain +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 # +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -o xxx/outputs/deep_vae/logs/slurm/%x-%j.log +#SBATCH -e xxx/outputs/deep_vae/logs/slurm/%x-%j.err +# SBATCH --requeue +# SBATCH --qos=preemptive + +set -x -e + +ulimit -s unlimited +echo "START TIME: $(date)" + +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +# export MASTER_ADDR=127.0.0.1 +export MASTER_PORT=$[RANDOM%10000+50000] + +MICRO_BATCH_SIZE=64 +ZERO_STAGE=0 + +ROOT_PATH=xxxx +config_json=${ROOT_PATH}/job_out/ds_config.json + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=~/tmp + +# NOTE both encoder and decoder use the same model +GPT2_MODEL_PATH=xxx +VAE_ARGS=" + --gpt2_model_path $GPT2_MODEL_PATH \ + --latent_dim 32 \ + --beta_kl_constraints_start 1e-5 \ + --beta_kl_constraints_stop 1. 
\ + --beta_n_cycles 40 \ +" + + +CHECKPOINT_SAVE_PATH=${ROOT_PATH}/checkpoints +MODEL_CHECKPOINT_ARGS="\ + --monitor val_recon_loss \ + --save_top_k 1 \ + --mode min \ + --every_n_train_steps 1000 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_SAVE_PATH \ + --filename checkpoint-{epoch}-{step}-filenum_20_dim_32_beta_1e-5_1_zh_finance \ + " + +TRAINER_ARGS=" + --max_epochs 40 \ + --gpus 1 \ + --num_nodes 1 \ + --precision 16 \ + --val_check_interval 1000 \ + --learning_rate 5e-5 \ + --warmup_steps 10000 \ + --weight_decay 0.01 \ + --default_root_dir ${ROOT_PATH} \ + --log_every_n_steps 50 \ + --strategy deepspeed_stage_2 \ +" +# --strategy deepspeed_stage_2 \ + +# note we use wudao optimus instead of recreating a deepVAE dataset +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --eval_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --num_workers 32 \ + --ds_name zh_finance +" +# --ds_name wudao_tdvae, ner_re_data, zh_finance +# --CVAE +SCRIPTS_PATH=xxx/fengshen/examples/pretrain_vae + +export CMD=" \ + $SCRIPTS_PATH/pretrain_deep_vae.py \ + $TRAINER_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $VAE_ARGS \ + $DATA_ARGS \ + " +# srun python $CMD +# python -m debugpy --listen 5678 --wait-for-client $CMD +python $CMD \ No newline at end of file diff --git a/fengshen/examples/deepVAE/vae_pl_module.py b/fengshen/examples/deepVAE/vae_pl_module.py new file mode 100644 index 0000000000000000000000000000000000000000..15a7ebdf52983f5266cf446b2c9c83c994f7a4f7 --- /dev/null +++ b/fengshen/examples/deepVAE/vae_pl_module.py @@ -0,0 +1,278 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. 
""" + +import os +import torch +import numpy as np +from fengshen.models.deepVAE.deep_vae import DeepVAE +from pytorch_lightning.core.lightning import LightningModule +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from transformers.models.bert.tokenization_bert import BertTokenizer +from fengshen.models.deepVAE.latent_connector import GPT2ForDecoderLatentConnector, GPT2ForEncoderLatentConnector +from transformers.optimization import AdamW, get_linear_schedule_with_warmup + + +class DeepVAEModule(LightningModule): + @classmethod + def add_module_specific_args(cls, parser): + group = parser.add_argument_group('vae', 'configurations') + group.add_argument("--checkpoint_path", type=str, default=None) + group.add_argument("--gpt2_model_path", type=str) + group.add_argument("--beta_kl_constraints_start", default=1, type=float, + help="min beta for all the latent z posterior vs prior kl loss") + group.add_argument("--beta_kl_constraints_stop", default=1, type=float, + help="max beta for all the latent z posterior vs prior kl loss") + group.add_argument("--beta_n_cycles", default=30, type=int, + help="number of cycles for kl loss ratio within an epoch") + group.add_argument("--freebit_kl_constraints", default=.1, type=float, + help="free bit for all the latent z kl loss") + group.add_argument("--latent_dim", default=256, type=int, + help="latent dimension of deepVAE Z") + group.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + group.add_argument("--weight_decay", default=0.0, type=float, + help="Weight deay if we apply some.") + group.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + group.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + group.add_argument("--warmup_steps", default=0, type=int, + help="Linear warmup over warmup_steps.") + group.add_argument("--CVAE", action='store_true', + help="specify this argument if finetuning CVAE, otherwise ignore this argument") + + return parser + + @classmethod + def load_model(cls, args, labels_dict=None): + checkpoint = torch.load(os.path.join(args.checkpoint_path, 'mp_rank_00_model_states.pt')) + + latent_dim = checkpoint['latent_dim'] if ('latent_dim' in checkpoint.keys()) else args.latent_dim + labels_dict = checkpoint['label_dict'] if ('label_dict' in checkpoint.keys()) else labels_dict + + enc_config = GPT2Config.from_pretrained(args.gpt2_model_path) + tokenizer = BertTokenizer.from_pretrained(args.gpt2_model_path) + special_tokens_dict = {'bos_token': '', 'eos_token': ''} + # special_tokens_dict = {'bos_token': '', 'eos_token': '', 'additional_special_tokens': ['', '']} + tokenizer.add_special_tokens(special_tokens_dict) + encoder_model = GPT2ForEncoderLatentConnector(config=enc_config) + encoder_model.resize_token_embeddings(len(tokenizer)) + + dec_config = GPT2Config.from_pretrained(args.gpt2_model_path) + decoder_model = GPT2ForDecoderLatentConnector(config=dec_config, latent_dim=latent_dim) + decoder_model.resize_token_embeddings(len(tokenizer)) + + vae_model = DeepVAE(encoder_model, decoder_model, latent_dim=latent_dim, + hidden_dim=enc_config.hidden_size, layer_num=enc_config.num_hidden_layers, + pad_token_id=tokenizer.pad_token_id, unk_token_id=tokenizer.unk_token_id, + bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, + CVAE=args.CVAE) + + # TODO: all the related params should be loaded here! 
Including latent_nets, posterior_nets, prior_nets, pooling, decoder.transformer.Wv, decoder.transformer.Wz + anchor = 'module.model.' + start = len(anchor) + vae_dict = {key[start:]: val for key, val in checkpoint['module'].items() if anchor in key} + # comment out if not initialized from VAE + # if args.CVAE: + # # manually load prior and posterior if initialize CVAE model for the first time because of dim mismatch + # prior_post_dict = {key: vae_dict.pop(key) for key in list(vae_dict) if ('posterior_nets' in key or 'prior_nets' in key)} + # for idx in range(enc_config.num_hidden_layers): + # vae_model.posterior_nets[idx].weight.data[:, enc_config.hidden_size:] = prior_post_dict[f"posterior_nets.{idx}.weight"] + # vae_model.prior_nets[idx].weight.data[:, enc_config.hidden_size:] = prior_post_dict[f"prior_nets.{idx}.weight"] + # enc_wte_shape, dec_wte_shape = vae_dict['encoder.transformer.wte.weight'].shape[0], vae_dict['decoder.transformer.wte.weight'].shape[0] + # vae_model.encoder.transformer.wte.weight.data[:enc_wte_shape, :] = vae_dict.pop('encoder.transformer.wte.weight') + # vae_model.decoder.transformer.wte.weight.data[:dec_wte_shape, :] = vae_dict.pop('decoder.transformer.wte.weight') + # vae_model.decoder.lm_head.weight.data[:dec_wte_shape, :] = vae_dict.pop('decoder.lm_head.weight') + missing_keys, unexpected_keys = vae_model.load_state_dict(vae_dict, strict=False) + print(f"Vae model loading process: missing keys {missing_keys}, unexpected keys {unexpected_keys}") + + return vae_model, tokenizer + + def __init__( + self, + args, + train_steps=0, + labels_dict=None + ): + super().__init__() + # self.save_hyperparameters() + self.args = args + + if args.checkpoint_path is not None: + self.model, self.encoder_tokenizer, self.decoder_tokenizer, self.latent_dim, \ + self.labels_dict, self.args = DeepVAEModule.load_model(self.args, labels_dict=labels_dict) + else: + self.encoder_tokenizer = BertTokenizer.from_pretrained(self.args.encoder_model_path) + encoder_config = GPT2Config.from_pretrained(self.args.encoder_model_path) + special_tokens_dict = {'bos_token': '', 'eos_token': '', 'additional_special_tokens': ['', '']} + self.encoder_tokenizer.add_special_tokens(special_tokens_dict) + self.latent_dim = self.args.latent_dim + encoder = GPT2ForEncoderLatentConnector.from_pretrained(self.args.encoder_model_path, config=encoder_config) + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + encoder.resize_token_embeddings(len(self.encoder_tokenizer)) + + self.decoder_tokenizer = BertTokenizer.from_pretrained(self.args.decoder_model_path) + self.decoder_tokenizer.add_special_tokens(special_tokens_dict) + decoder_config = GPT2Config.from_pretrained(self.args.decoder_model_path) + self.labels_dict = labels_dict + decoder = GPT2ForDecoderLatentConnector.from_pretrained(self.args.decoder_model_path, config=decoder_config, + latent_dim=self.latent_dim) + + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
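+            # the decoder tokenizer was extended with the same special tokens above,
+            # so the decoder embedding matrix has to be enlarged to match it as well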
+ decoder.resize_token_embeddings(len(self.decoder_tokenizer)) + self.model = DeepVAE(encoder, decoder, latent_dim=self.args.latent_dim, + hidden_dim=encoder_config.hidden_size, layer_num=encoder_config.num_hidden_layers, + pad_token_id=self.decoder_tokenizer.pad_token_id, unk_token_id=self.decoder_tokenizer.unk_token_id, + bos_token_id=self.decoder_tokenizer.bos_token_id, eos_token_id=self.decoder_tokenizer.eos_token_id, + CVAE=args.CVAE) + + self.train_steps = train_steps + # TODO: adjust the cyclic schedule + self.beta_kl_constraints_list = self.get_cyclic_linear_beta_list(self.train_steps, + start=args.beta_kl_constraints_start, stop=args.beta_kl_constraints_stop, n_cycle=args.beta_n_cycles) + # self.mlm_probability_list = self.get_decoder_beta_list(self.train_steps, + # start=0., stop=1., n_cycle=args.beta_n_cycles) + # self.beta_kl_constraints_list = self.get_constant_ratio(self.train_steps, args.beta_kl_constraints) + self.mlm_probability_list = self.get_constant_ratio(self.train_steps, 0.) + # self.freebit_kl_constraints = args.freebit_kl_constraints + + def get_constant_ratio(self, n_steps, ratio): + L = np.ones(n_steps) + L *= ratio + return L + + def get_decoder_beta_list(self, n_steps, start=0., stop=1.0, n_cycle=4): + L = np.ones(n_steps) + t_range = int(n_steps / n_cycle) + for t_cur in range(n_steps): + if t_cur > t_range: + L[t_cur] = 0. + else: + ratio = t_cur / t_range + value = stop - ratio * (stop-start) + L[t_cur] = value + return L + + def get_cyclic_linear_beta_list(self, n_steps, start=0.5, stop=1.0, n_cycle=4): + L = np.ones(n_steps) + t_range = int(n_steps / n_cycle) + for t_cur in range(n_steps): + loc = t_cur % t_range + split_range = int(t_range * 0.25) + if loc <= 2*split_range: + value = start + elif loc <= 3*split_range: + ratio = (loc % split_range) / split_range + value = ratio * (stop-start) + else: + value = stop + L[t_cur] = value + return L + + ##### + # Torch lightning + ##### + + def on_save_checkpoint(self, checkpoint) -> None: + checkpoint['label_dict'] = self.labels_dict + checkpoint['latent_dim'] = self.latent_dim + + def training_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + + total_loss, rec_loss, total_kl_loss, layer_kl_loss = \ + self.model(inputs, self.beta_kl_constraints_list[batch_idx], cond_inputs) + # the logging interval are set by the trainer_args log_every_n_steps + for idx, pg in enumerate(self.optimizers().param_groups): + self.log(f"learning_rate_{idx}", pg['lr']) + unscaled_kl_constraint_loss = 0. if self.beta_kl_constraints_list[batch_idx] == 0. 
else total_kl_loss/self.beta_kl_constraints_list[batch_idx] + self.log("total_loss", total_loss) + self.log("total_kl_constraint_loss", total_kl_loss) + self.log("unscaled_kl_constraint_loss", unscaled_kl_constraint_loss) + self.log("beta_kl_constraints", self.beta_kl_constraints_list[batch_idx]) + self.log("beta_mlm_probability", self.mlm_probability_list[batch_idx]) + self.log("rec_loss", rec_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + + return total_loss + + def training_step_end(self, batch_parts): + pass + + def training_epoch_end(self, outputs): + pass + + def validation_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + + total_loss, rec_loss, total_kl_loss, layer_kl_loss = self.model(inputs, 1., cond_inputs) + # the logging interval are set by the trainer_args log_every_n_steps + self.log("val_total_loss", total_loss) + self.log("val_kl_constraint_loss", total_kl_loss) + self.log("val_recon_loss", rec_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + return total_loss + + def validation_epoch_end(self, outputs): + pass + + def test_step(self, batch, batch_idx): + if batch is None: + loss = torch.Tensor([0.]).to(next(self.model.parameters()).device) + loss.requires_grad = True + return loss + inputs, cond_inputs = batch, None + if self.args.CVAE: + inputs, cond_inputs = batch + total_loss, rec_loss, total_kl_loss, layer_kl_loss = self.model(inputs, 1., cond_inputs) + self.log("test_total_loss", total_loss) + self.log("test_recon_loss", rec_loss) + self.log("test_kl_constraint_loss", total_kl_loss) + for idx, kl_loss in enumerate(layer_kl_loss): + self.log(f"layer_{idx}_kl_loss", kl_loss.mean()) + return total_loss + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': self.args.weight_decay}, + {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon) + scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=self.train_steps) + + return {'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + } diff --git a/fengshen/examples/disco_project/README.md b/fengshen/examples/disco_project/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c8d95f886e1d80fd1e198eb8d0618c77b6f8836d --- /dev/null +++ b/fengshen/examples/disco_project/README.md @@ -0,0 +1,18 @@ +# Chinese Warp For Disco Diffusion +- This is a chinese version disco diffusion. We train a Chinese CLIP [IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese) and utilize it to guide the diffusion process. +- This code is modified from https://github.com/alembics/disco-diffusion +- streamlit demo is supported. +- the checkpoint has been upload to hugging face. 
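+
+Besides the command line entry point below, the `Diffuser` class in `disco.py` can be called directly from Python. A minimal sketch, based on the `__main__` block of `disco.py` (the prompt, checkpoint name and output filename are only examples):
+
+```
+from disco import Diffuser
+
+dd = Diffuser('IDEA-CCNL/Taiyi-Diffusion-532M-Nature')  # or IDEA-CCNL/Taiyi-Diffusion-532M-Cyberpunk
+image = dd.generate(
+    ['夕阳西下'],              # list of Chinese text prompts
+    clip_guidance_scale=7500,  # strength of the Taiyi-CLIP guidance
+    side_x=512, side_y=512,    # output resolution, multiples of 64
+)
+image.save('result.png')       # generate() also writes samples to images_out/
+```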
+## Usage + +- Install the lack package directly +### Run Directly +``` +python disco.py --prompt 夕阳西下 --model_path IDEA-CCNL/Taiyi-Diffusion-532M-Nature # or IDEA-CCNL/Taiyi-Diffusion-532M-Cyberpunk +``` + +### Streamlit Setup +``` +streamlit run st_disco.py +# --server.port=xxxx --server.address=xxxx +``` diff --git a/fengshen/examples/disco_project/disco.py b/fengshen/examples/disco_project/disco.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8b516f13311c9797ea27fa6410361a5dfa715a --- /dev/null +++ b/fengshen/examples/disco_project/disco.py @@ -0,0 +1,735 @@ +import os +import sys +# sys.path.insert(0, f'{PROJECT_DIR}/guided-diffusion') # 加在前面,不再读取库文件的东西。 +import subprocess +import io +import torch.nn as nn +from torch.nn import functional as F +import torch +import torchvision.transforms.functional as TF +import torchvision.transforms as T +import math +import requests +import cv2 +from resize_right import resize +from guided_diffusion.guided_diffusion.script_util import model_and_diffusion_defaults +from types import SimpleNamespace +from PIL import Image +import argparse +from guided_diffusion.guided_diffusion.unet import HFUNetModel +from tqdm.notebook import tqdm +from datetime import datetime +from guided_diffusion.guided_diffusion.script_util import create_model_and_diffusion +import clip +from transformers import BertForSequenceClassification, BertTokenizer +import gc +import random + + +# ======================== GLOBAL SETTING ======================== +PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) + +useCPU = False # @param {type:"boolean"} +skip_augs = False # @param{type: 'boolean'} +perlin_init = False # @param{type: 'boolean'} + +use_secondary_model = False +diffusion_model = "custom" + +# Dimensions must by multiples of 64. 
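+# default output resolution; the guided-diffusion model below is configured with
+# image_size 512, and Diffuser.generate() also accepts side_x/side_y per call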
+side_x = 512 +side_y = 512 + +diffusion_sampling_mode = 'ddim' # @param ['plms','ddim'] +use_checkpoint = True # @param {type: 'boolean'} +ViTB32 = False # @param{type:"boolean"} +ViTB16 = False # @param{type:"boolean"} +ViTL14 = True # @param{type:"boolean"} +ViTL14_336px = False # @param{type:"boolean"} +RN101 = False # @param{type:"boolean"} +RN50 = False # @param{type:"boolean"} +RN50x4 = False # @param{type:"boolean"} +RN50x16 = False # @param{type:"boolean"} +RN50x64 = False # @param{type:"boolean"} + + +# @markdown #####**OpenCLIP settings:** +ViTB32_laion2b_e16 = False # @param{type:"boolean"} +ViTB32_laion400m_e31 = False # @param{type:"boolean"} +ViTB32_laion400m_32 = False # @param{type:"boolean"} +ViTB32quickgelu_laion400m_e31 = False # @param{type:"boolean"} +ViTB32quickgelu_laion400m_e32 = False # @param{type:"boolean"} +ViTB16_laion400m_e31 = False # @param{type:"boolean"} +ViTB16_laion400m_e32 = False # @param{type:"boolean"} +RN50_yffcc15m = False # @param{type:"boolean"} +RN50_cc12m = False # @param{type:"boolean"} +RN50_quickgelu_yfcc15m = False # @param{type:"boolean"} +RN50_quickgelu_cc12m = False # @param{type:"boolean"} +RN101_yfcc15m = False # @param{type:"boolean"} +RN101_quickgelu_yfcc15m = False # @param{type:"boolean"} + +# @markdown ####**Basic Settings:** + +# NOTE steps可以改这里,需要重新初始化模型,我懒得改接口了orz +steps = 100 # @param [25,50,100,150,250,500,1000]{type: 'raw', allow-input: true} +tv_scale = 0 # @param{type: 'number'} +range_scale = 150 # @param{type: 'number'} +sat_scale = 0 # @param{type: 'number'} +cutn_batches = 1 # @param{type: 'number'} # NOTE 这里会对图片做数据增强,累计计算n次CLIP的梯度,以此作为guidance。 +skip_augs = False # @param{type: 'boolean'} +# @markdown ####**Saving:** + +intermediate_saves = 0 # @param{type: 'raw'} +intermediates_in_subfolder = True # @param{type: 'boolean'} + +# perlin_init = False # @param{type: 'boolean'} +perlin_mode = 'mixed' # @param ['mixed', 'color', 'gray'] +set_seed = 'random_seed' # @param{type: 'string'} +eta = 0.8 # @param{type: 'number'} +clamp_grad = True # @param{type: 'boolean'} +clamp_max = 0.05 # @param{type: 'number'} + +# EXTRA ADVANCED SETTINGS: +randomize_class = True +clip_denoised = False +fuzzy_prompt = False +rand_mag = 0.05 + +# @markdown --- +cut_overview = "[12]*400+[4]*600" # @param {type: 'string'} +cut_innercut = "[4]*400+[12]*600" # @param {type: 'string'} +cut_ic_pow = "[1]*1000" # @param {type: 'string'} +cut_icgray_p = "[0.2]*400+[0]*600" # @param {type: 'string'} + + +# @markdown ####**Transformation Settings:** +use_vertical_symmetry = False # @param {type:"boolean"} +use_horizontal_symmetry = False # @param {type:"boolean"} +transformation_percent = [0.09] # @param + +display_rate = 3 # @param{type: 'number'} +n_batches = 1 # @param{type: 'number'} + +# @markdown If you're having issues with model downloads, check this to compare SHA's: +check_model_SHA = False # @param{type:"boolean"} +interp_spline = 'Linear' # Do not change, currently will not look good. 
param ['Linear','Quadratic','Cubic']{type:"string"} +resume_run = False +batch_size = 1 + + +def createPath(filepath): + os.makedirs(filepath, exist_ok=True) + + +def wget(url, outputdir): + res = subprocess.run(['wget', url, '-P', f'{outputdir}'], stdout=subprocess.PIPE).stdout.decode('utf-8') + print(res) + + +def alpha_sigma_to_t(alpha, sigma): + return torch.atan2(sigma, alpha) * 2 / math.pi + + +def interp(t): + return 3 * t**2 - 2 * t ** 3 + + +def perlin(width, height, scale=10, device=None): + gx, gy = torch.randn(2, width + 1, height + 1, 1, 1, device=device) + xs = torch.linspace(0, 1, scale + 1)[:-1, None].to(device) + ys = torch.linspace(0, 1, scale + 1)[None, :-1].to(device) + wx = 1 - interp(xs) + wy = 1 - interp(ys) + dots = 0 + dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) + dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) + dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) + dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) + return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) + + +def perlin_ms(octaves, width, height, grayscale, device=None): + out_array = [0.5] if grayscale else [0.5, 0.5, 0.5] + # out_array = [0.0] if grayscale else [0.0, 0.0, 0.0] + for i in range(1 if grayscale else 3): + scale = 2 ** len(octaves) + oct_width = width + oct_height = height + for oct in octaves: + p = perlin(oct_width, oct_height, scale, device) + out_array[i] += p * oct + scale //= 2 + oct_width *= 2 + oct_height *= 2 + return torch.cat(out_array) + + +def fetch(url_or_path): + if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'): + r = requests.get(url_or_path) + r.raise_for_status() + fd = io.BytesIO() + fd.write(r.content) + fd.seek(0) + return fd + return open(url_or_path, 'rb') + + +def read_image_workaround(path): + """OpenCV reads images as BGR, Pillow saves them as RGB. 
Work around + this incompatibility to avoid colour inversions.""" + im_tmp = cv2.imread(path) + return cv2.cvtColor(im_tmp, cv2.COLOR_BGR2RGB) + + +def parse_prompt(prompt): + if prompt.startswith('http://') or prompt.startswith('https://'): + vals = prompt.rsplit(':', 2) + vals = [vals[0] + ':' + vals[1], *vals[2:]] + else: + vals = prompt.rsplit(':', 1) + vals = vals + ['', '1'][len(vals):] + return vals[0], float(vals[1]) + + +def sinc(x): + return torch.where(x != 0, torch.sin(math.pi * x) / (math.pi * x), x.new_ones([])) + + +def lanczos(x, a): + cond = torch.logical_and(-a < x, x < a) + out = torch.where(cond, sinc(x) * sinc(x / a), x.new_zeros([])) + return out / out.sum() + + +def ramp(ratio, width): + n = math.ceil(width / ratio + 1) + out = torch.empty([n]) + cur = 0 + for i in range(out.shape[0]): + out[i] = cur + cur += ratio + return torch.cat([-out[1:].flip([0]), out])[1:-1] + + +def resample(input, size, align_corners=True): + n, c, h, w = input.shape + dh, dw = size + + input = input.reshape([n * c, 1, h, w]) + + if dh < h: + kernel_h = lanczos(ramp(dh / h, 2), 2).to(input.device, input.dtype) + pad_h = (kernel_h.shape[0] - 1) // 2 + input = F.pad(input, (0, 0, pad_h, pad_h), 'reflect') + input = F.conv2d(input, kernel_h[None, None, :, None]) + + if dw < w: + kernel_w = lanczos(ramp(dw / w, 2), 2).to(input.device, input.dtype) + pad_w = (kernel_w.shape[0] - 1) // 2 + input = F.pad(input, (pad_w, pad_w, 0, 0), 'reflect') + input = F.conv2d(input, kernel_w[None, None, None, :]) + + input = input.reshape([n, c, h, w]) + return F.interpolate(input, size, mode='bicubic', align_corners=align_corners) + + +class MakeCutouts(nn.Module): + def __init__(self, cut_size, cutn, skip_augs=False): + super().__init__() + self.cut_size = cut_size + self.cutn = cutn + self.skip_augs = skip_augs + self.augs = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomAffine(degrees=15, translate=(0.1, 0.1)), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomPerspective(distortion_scale=0.4, p=0.7), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.15), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + # T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + input = T.Pad(input.shape[2] // 4, fill=0)(input) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + + cutouts = [] + for ch in range(self.cutn): + if ch > self.cutn - self.cutn // 4: + cutout = input.clone() + else: + size = int(max_size * torch.zeros(1,).normal_(mean=.8, std=.3).clip(float(self.cut_size / max_size), 1.)) + offsetx = torch.randint(0, abs(sideX - size + 1), ()) + offsety = torch.randint(0, abs(sideY - size + 1), ()) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + + if not self.skip_augs: + cutout = self.augs(cutout) + cutouts.append(resample(cutout, (self.cut_size, self.cut_size))) + del cutout + + cutouts = torch.cat(cutouts, dim=0) + return cutouts + + +class MakeCutoutsDango(nn.Module): + def __init__(self, cut_size, args, + Overview=4, + InnerCrop=0, IC_Size_Pow=0.5, IC_Grey_P=0.2, + ): + super().__init__() + self.padargs = {} + self.cutout_debug = False + self.cut_size = cut_size + self.Overview = Overview + self.InnerCrop = InnerCrop + self.IC_Size_Pow = IC_Size_Pow + self.IC_Grey_P = IC_Grey_P + self.augs = T.Compose([ + T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + 
T.RandomAffine(degrees=10, translate=(0.05, 0.05), interpolation=T.InterpolationMode.BILINEAR), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.1), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1), + ]) + + def forward(self, input): + cutouts = [] + gray = T.Grayscale(3) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + min_size = min(sideX, sideY, self.cut_size) + output_shape = [1, 3, self.cut_size, self.cut_size] + pad_input = F.pad(input, ((sideY - max_size) // 2, (sideY - max_size) // 2, (sideX - max_size) // 2, (sideX - max_size) // 2), **self.padargs) + cutout = resize(pad_input, out_shape=output_shape) + + if self.Overview > 0: + if self.Overview <= 4: + if self.Overview >= 1: + cutouts.append(cutout) + if self.Overview >= 2: + cutouts.append(gray(cutout)) + if self.Overview >= 3: + cutouts.append(TF.hflip(cutout)) + if self.Overview == 4: + cutouts.append(gray(TF.hflip(cutout))) + else: + cutout = resize(pad_input, out_shape=output_shape) + for _ in range(self.Overview): + cutouts.append(cutout) + + if self.cutout_debug: + # if is_colab: + # TF.to_pil_image(cutouts[0].clamp(0, 1).squeeze(0)).save("/content/cutout_overview0.jpg",quality=99) + # else: + TF.to_pil_image(cutouts[0].clamp(0, 1).squeeze(0)).save("cutout_overview0.jpg", quality=99) + + if self.InnerCrop > 0: + for i in range(self.InnerCrop): + size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size) + offsetx = torch.randint(0, sideX - size + 1, ()) + offsety = torch.randint(0, sideY - size + 1, ()) + cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size] + if i <= int(self.IC_Grey_P * self.InnerCrop): + cutout = gray(cutout) + cutout = resize(cutout, out_shape=output_shape) + cutouts.append(cutout) + if self.cutout_debug: + # if is_colab: + # TF.to_pil_image(cutouts[-1].clamp(0, 1).squeeze(0)).save("/content/cutout_InnerCrop.jpg",quality=99) + # else: + TF.to_pil_image(cutouts[-1].clamp(0, 1).squeeze(0)).save("cutout_InnerCrop.jpg", quality=99) + cutouts = torch.cat(cutouts) + if skip_augs is not True: + cutouts = self.augs(cutouts) + return cutouts + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def tv_loss(input): + """L2 total variation loss, as in Mahendran et al.""" + input = F.pad(input, (0, 1, 0, 1), 'replicate') + x_diff = input[..., :-1, 1:] - input[..., :-1, :-1] + y_diff = input[..., 1:, :-1] - input[..., :-1, :-1] + return (x_diff**2 + y_diff**2).mean([1, 2, 3]) + + +def range_loss(input): + return (input - input.clamp(-1, 1)).pow(2).mean([1, 2, 3]) + + +def symmetry_transformation_fn(x): + # NOTE 强制图像对称 + use_horizontal_symmetry = False + if use_horizontal_symmetry: + [n, c, h, w] = x.size() + x = torch.concat((x[:, :, :, :w // 2], torch.flip(x[:, :, :, :w // 2], [-1])), -1) + print("horizontal symmetry applied") + if use_vertical_symmetry: + [n, c, h, w] = x.size() + x = torch.concat((x[:, :, :h // 2, :], torch.flip(x[:, :, :h // 2, :], [-2])), -2) + print("vertical symmetry applied") + return x + + +# def split_prompts(prompts): +# prompt_series = pd.Series([np.nan for a in range(max_frames)]) +# for i, prompt in prompts.items(): +# prompt_series[i] = prompt +# # prompt_series = prompt_series.astype(str) +# prompt_series = prompt_series.ffill().bfill() +# return prompt_series + + +""" +other chaos settings +""" +# dir settings + 
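+# images_out/ collects the samples written by Diffuser.generate();
+# models/ is created as a local folder for model files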
+outDirPath = f'{PROJECT_DIR}/images_out' +createPath(outDirPath) +model_path = f'{PROJECT_DIR}/models' +createPath(model_path) + + +# GPU setup +DEVICE = torch.device('cuda:0' if (torch.cuda.is_available() and not useCPU) else 'cpu') +print('Using device:', DEVICE) +device = DEVICE # At least one of the modules expects this name.. +if not useCPU: + if torch.cuda.get_device_capability(DEVICE) == (8, 0): # A100 fix thanks to Emad + print('Disabling CUDNN for A100 gpu', file=sys.stderr) + torch.backends.cudnn.enabled = False + +model_config = model_and_diffusion_defaults() +model_config.update({ + 'attention_resolutions': '32, 16, 8', + 'class_cond': False, + 'diffusion_steps': 1000, # No need to edit this, it is taken care of later. + 'rescale_timesteps': True, + 'timestep_respacing': 250, # No need to edit this, it is taken care of later. + 'image_size': 512, + 'learn_sigma': True, + 'noise_schedule': 'linear', + 'num_channels': 256, + 'num_head_channels': 64, + 'num_res_blocks': 2, + 'resblock_updown': True, + 'use_checkpoint': use_checkpoint, + 'use_fp16': not useCPU, + 'use_scale_shift_norm': True, +}) + +model_default = model_config['image_size'] +normalize = T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) + +# Make folder for batch +steps_per_checkpoint = steps + 10 +# Update Model Settings +timestep_respacing = f'ddim{steps}' +diffusion_steps = (1000 // steps) * steps if steps < 1000 else steps +model_config.update({ + 'timestep_respacing': timestep_respacing, + 'diffusion_steps': diffusion_steps, +}) + + +start_frame = 0 +print('Starting Run:') +if set_seed == 'random_seed': + random.seed() + seed = random.randint(0, 2**32) + # print(f'Using seed: {seed}') +else: + seed = int(set_seed) + +args = { + # 'seed': seed, + 'display_rate': display_rate, + 'n_batches': n_batches, + 'batch_size': batch_size, + 'steps': steps, + 'diffusion_sampling_mode': diffusion_sampling_mode, + # 'width_height': width_height, + 'tv_scale': tv_scale, + 'range_scale': range_scale, + 'sat_scale': sat_scale, + 'cutn_batches': cutn_batches, + # 'side_x': side_x, + # 'side_y': side_y, + 'timestep_respacing': timestep_respacing, + 'diffusion_steps': diffusion_steps, + 'cut_overview': eval(cut_overview), + 'cut_innercut': eval(cut_innercut), + 'cut_ic_pow': eval(cut_ic_pow), + 'cut_icgray_p': eval(cut_icgray_p), + 'intermediate_saves': intermediate_saves, + 'intermediates_in_subfolder': intermediates_in_subfolder, + 'steps_per_checkpoint': steps_per_checkpoint, + 'set_seed': set_seed, + 'eta': eta, + 'clamp_grad': clamp_grad, + 'clamp_max': clamp_max, + 'skip_augs': skip_augs, + 'randomize_class': randomize_class, + 'clip_denoised': clip_denoised, + 'fuzzy_prompt': fuzzy_prompt, + 'rand_mag': rand_mag, + 'use_vertical_symmetry': use_vertical_symmetry, + 'use_horizontal_symmetry': use_horizontal_symmetry, + 'transformation_percent': transformation_percent, +} +args = SimpleNamespace(**args) + +# ======================== GLOBAL SETTING END ======================== + + +class Diffuser: + def __init__(self, cutom_path='IDEA-CCNL/Taiyi-Diffusion-532M-Nature'): + self.model_setup(cutom_path) + + def model_setup(self, custom_path): + # LOADING MODEL + os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + print(f'Prepping model...model name: {custom_path}') + __, self.diffusion = create_model_and_diffusion(**model_config) + self.model = HFUNetModel.from_pretrained(custom_path) + # total = get_parameter_num(self.model) + # print("Number of parameter: %.2fM" % (total/1e6)) + # 
print("Number of parameter: %.2fM" % (total/1024/1024)) + + self.model.requires_grad_(False).eval().to(device) + for name, param in self.model.named_parameters(): + if 'qkv' in name or 'norm' in name or 'proj' in name: + param.requires_grad_() + if model_config['use_fp16']: + self.model.convert_to_fp16() + print(f'Diffusion_model Loaded {diffusion_model}') + + # NOTE Directly Load The Text Encoder From Hugging Face + print('Prepping model...model name: CLIP') + self.taiyi_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese") + self.taiyi_transformer = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-Roberta-large-326M-Chinese").eval().to(device) + self.clip_models = [] + if ViTB32: + self.clip_models.append(clip.load('ViT-B/32', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTB16: + self.clip_models.append(clip.load('ViT-B/16', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTL14: + self.clip_models.append(clip.load('ViT-L/14', jit=False)[0].eval().requires_grad_(False).to(device)) + if ViTL14_336px: + self.clip_models.append(clip.load('ViT-L/14@336px', jit=False)[0].eval().requires_grad_(False).to(device)) + print('CLIP Loaded') + # self.lpips_model = lpips.LPIPS(net='vgg').to(device) + + def generate(self, + input_text_prompts=['夕阳西下'], + init_image=None, + skip_steps=10, + clip_guidance_scale=7500, + init_scale=2000, + st_dynamic_image=None, + seed=None, + side_x=512, + side_y=512, + ): + + seed = seed + frame_num = 0 + init_image = init_image + init_scale = init_scale + skip_steps = skip_steps + loss_values = [] + # if seed is not None: + # np.random.seed(seed) + # random.seed(seed) + # torch.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) + # torch.backends.cudnn.deterministic = True + # target_embeds, weights = [], [] + frame_prompt = input_text_prompts + + print(f'Frame {frame_num} Prompt: {frame_prompt}') + + model_stats = [] + for clip_model in self.clip_models: + # cutn = 16 + model_stat = {"clip_model": None, "target_embeds": [], "make_cutouts": None, "weights": []} + model_stat["clip_model"] = clip_model + + for prompt in frame_prompt: + txt, weight = parse_prompt(prompt) + # txt = clip_model.encode_text(clip.tokenize(prompt).to(device)).float() + # NOTE use chinese CLIP + txt = self.taiyi_transformer(self.taiyi_tokenizer(txt, return_tensors='pt')['input_ids'].to(device)).logits + if args.fuzzy_prompt: + for i in range(25): + model_stat["target_embeds"].append((txt + torch.randn(txt.shape).cuda() * args.rand_mag).clamp(0, 1)) + model_stat["weights"].append(weight) + else: + model_stat["target_embeds"].append(txt) + model_stat["weights"].append(weight) + + model_stat["target_embeds"] = torch.cat(model_stat["target_embeds"]) + model_stat["weights"] = torch.tensor(model_stat["weights"], device=device) + if model_stat["weights"].sum().abs() < 1e-3: + raise RuntimeError('The weights must not sum to 0.') + model_stat["weights"] /= model_stat["weights"].sum().abs() + model_stats.append(model_stat) + + init = None + if init_image is not None: + # init = Image.open(fetch(init_image)).convert('RGB') # 传递的是加载好的图片。而非地址~ + init = init_image + init = init.resize((side_x, side_y), Image.LANCZOS) + init = TF.to_tensor(init).to(device).unsqueeze(0).mul(2).sub(1) + + cur_t = None + + def cond_fn(x, t, y=None): + with torch.enable_grad(): + x_is_NaN = False + x = x.detach().requires_grad_() + n = x.shape[0] + + my_t = torch.ones([n], device=device, dtype=torch.long) * cur_t + out = 
self.diffusion.p_mean_variance(self.model, x, my_t, clip_denoised=False, model_kwargs={'y': y}) + fac = self.diffusion.sqrt_one_minus_alphas_cumprod[cur_t] + x_in = out['pred_xstart'] * fac + x * (1 - fac) + x_in_grad = torch.zeros_like(x_in) + + for model_stat in model_stats: + for i in range(args.cutn_batches): + t_int = int(t.item()) + 1 # errors on last step without +1, need to find source + # try: + input_resolution = model_stat["clip_model"].visual.input_resolution + # except: + # input_resolution = 224 + + cuts = MakeCutoutsDango(input_resolution, + Overview=args.cut_overview[1000 - t_int], + InnerCrop=args.cut_innercut[1000 - t_int], + IC_Size_Pow=args.cut_ic_pow[1000 - t_int], + IC_Grey_P=args.cut_icgray_p[1000 - t_int], + args=args, + ) + clip_in = normalize(cuts(x_in.add(1).div(2))) + image_embeds = model_stat["clip_model"].encode_image(clip_in).float() + dists = spherical_dist_loss(image_embeds.unsqueeze(1), model_stat["target_embeds"].unsqueeze(0)) + dists = dists.view([args.cut_overview[1000 - t_int] + args.cut_innercut[1000 - t_int], n, -1]) + losses = dists.mul(model_stat["weights"]).sum(2).mean(0) + loss_values.append(losses.sum().item()) # log loss, probably shouldn't do per cutn_batch + x_in_grad += torch.autograd.grad(losses.sum() * clip_guidance_scale, x_in)[0] / cutn_batches + tv_losses = tv_loss(x_in) + range_losses = range_loss(out['pred_xstart']) + sat_losses = torch.abs(x_in - x_in.clamp(min=-1, max=1)).mean() + loss = tv_losses.sum() * tv_scale + range_losses.sum() * range_scale + sat_losses.sum() * sat_scale + if init is not None and init_scale: + init_losses = self.lpips_model(x_in, init) + loss = loss + init_losses.sum() * init_scale + x_in_grad += torch.autograd.grad(loss, x_in)[0] + if not torch.isnan(x_in_grad).any(): + grad = -torch.autograd.grad(x_in, x, x_in_grad)[0] + else: + x_is_NaN = True + grad = torch.zeros_like(x) + if args.clamp_grad and not x_is_NaN: + magnitude = grad.square().mean().sqrt() + return grad * magnitude.clamp(max=args.clamp_max) / magnitude # min=-0.02, min=-clamp_max, + return grad + + if args.diffusion_sampling_mode == 'ddim': + sample_fn = self.diffusion.ddim_sample_loop_progressive + else: + sample_fn = self.diffusion.plms_sample_loop_progressive + + for i in range(args.n_batches): + current_time = datetime.now().strftime('%y%m%d-%H%M%S_%f') + + batchBar = tqdm(range(args.n_batches), desc="Batches") + batchBar.n = i + batchBar.refresh() + gc.collect() + torch.cuda.empty_cache() + cur_t = self.diffusion.num_timesteps - skip_steps - 1 + # total_steps = cur_t + + if args.diffusion_sampling_mode == 'ddim': + samples = sample_fn( + self.model, + (batch_size, 3, side_y, side_x), + clip_denoised=clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=randomize_class, + eta=eta, + transformation_fn=symmetry_transformation_fn, + transformation_percent=args.transformation_percent + ) + else: + samples = sample_fn( + self.model, + (batch_size, 3, side_y, side_x), + clip_denoised=clip_denoised, + model_kwargs={}, + cond_fn=cond_fn, + progress=True, + skip_timesteps=skip_steps, + init_image=init, + randomize_class=randomize_class, + order=2, + ) + + for j, sample in enumerate(samples): + cur_t -= 1 + intermediateStep = False + if args.steps_per_checkpoint is not None: + if j % steps_per_checkpoint == 0 and j > 0: + intermediateStep = True + elif j in args.intermediate_saves: + intermediateStep = True + if j % args.display_rate == 0 or cur_t == -1 or 
intermediateStep: + for k, image in enumerate(sample['pred_xstart']): + # tqdm.write(f'Batch {i}, step {j}, output {k}:') + # percent = math.ceil(j / total_steps * 100) + if args.n_batches > 0: + filename = f'{current_time}-{parse_prompt(prompt)[0]}.png' + image = TF.to_pil_image(image.add(1).div(2).clamp(0, 1)) + if j % args.display_rate == 0 or cur_t == -1: + image.save(f'{outDirPath}/{filename}') + if st_dynamic_image: + st_dynamic_image.image(image, use_column_width=True) + # self.current_image = image + return image + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="setting") + parser.add_argument('--prompt', type=str, required=True) + parser.add_argument('--text_scale', type=int, default=5000) + parser.add_argument('--model_path', type=str, default="IDEA-CCNL/Taiyi-Diffusion-532M-Nature") + parser.add_argument('--width', type=int, default=512) + parser.add_argument('--height', type=int, default=512) + + user_args = parser.parse_args() + + dd = Diffuser(user_args.model_path) + dd.generate([user_args.prompt], + clip_guidance_scale=user_args.text_scale, + side_x=user_args.width, + side_y=user_args.height, + ) diff --git a/fengshen/examples/disco_project/guided_diffusion/.gitignore b/fengshen/examples/disco_project/guided_diffusion/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2d97991d7edb176fa2b5829b9f3ceb6405bdb108 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/.gitignore @@ -0,0 +1,3 @@ +.DS_Store +__pycache__/ + diff --git a/fengshen/examples/disco_project/guided_diffusion/LICENSE b/fengshen/examples/disco_project/guided_diffusion/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9e84fcbc4d81a1f433c90caf9f1cef373c12edae --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/fengshen/examples/disco_project/guided_diffusion/__init__.py b/fengshen/examples/disco_project/guided_diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9665a0d63f695eab303318d824dad14041c7cde9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/__init__.py @@ -0,0 +1,3 @@ +""" +Codebase for "Improved Denoising Diffusion Probabilistic Models". +""" diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py new file mode 100644 index 0000000000000000000000000000000000000000..2c886705ad4dadb1f0b8b0624cc8f9e8d2dab0c9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/fp16_util.py @@ -0,0 +1,236 @@ +""" +Helpers to train with 16-bit precision. +""" + +import numpy as np +import torch as th +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + +from . import logger + +INITIAL_LOG_LOSS_SCALE = 20.0 + + +def convert_module_to_f16(ll): + """ + Convert primitive modules to float16. + """ + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + ll.weight.data = ll.weight.data.half() + if ll.bias is not None: + ll.bias.data = ll.bias.data.half() + + +def convert_module_to_f32(ll): + """ + Convert primitive modules to float32, undoing convert_module_to_f16(). + """ + if isinstance(ll, (nn.Conv1d, nn.Conv2d, nn.Conv3d)): + ll.weight.data = ll.weight.data.float() + if ll.bias is not None: + ll.bias.data = ll.bias.data.float() + + +def make_master_params(param_groups_and_shapes): + """ + Copy model parameters into a (differently-shaped) list of full-precision + parameters. + """ + master_params = [] + for param_group, shape in param_groups_and_shapes: + master_param = nn.Parameter( + _flatten_dense_tensors( + [param.detach().float() for (_, param) in param_group] + ).view(shape) + ) + master_param.requires_grad = True + master_params.append(master_param) + return master_params + + +def model_grads_to_master_grads(param_groups_and_shapes, master_params): + """ + Copy the gradients from the model parameters into the master parameters + from make_master_params(). + """ + for master_param, (param_group, shape) in zip( + master_params, param_groups_and_shapes + ): + master_param.grad = _flatten_dense_tensors( + [param_grad_or_zeros(param) for (_, param) in param_group] + ).view(shape) + + +def master_params_to_model_params(param_groups_and_shapes, master_params): + """ + Copy the master parameter data back into the model parameters. + """ + # Without copying to a list, if a generator is passed, this will + # silently not copy any parameters. 
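+    # Each entry in param_groups_and_shapes pairs a list of (name, param) model
+    # tensors with the shape of the flat fp32 master tensor built from them in
+    # make_master_params(); below, that flat tensor is split back into the original
+    # per-parameter shapes and copied in place into the (possibly fp16) model params.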
+ for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes): + for (_, param), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + param.detach().copy_(unflat_master_param) + + +def unflatten_master_params(param_group, master_param): + return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group]) + + +def get_param_groups_and_shapes(named_model_params): + named_model_params = list(named_model_params) + scalar_vector_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim <= 1], + (-1), + ) + matrix_named_params = ( + [(n, p) for (n, p) in named_model_params if p.ndim > 1], + (1, -1), + ) + return [scalar_vector_named_params, matrix_named_params] + + +def master_params_to_state_dict( + model, param_groups_and_shapes, master_params, use_fp16 +): + if use_fp16: + state_dict = model.state_dict() + for master_param, (param_group, _) in zip( + master_params, param_groups_and_shapes + ): + for (name, _), unflat_master_param in zip( + param_group, unflatten_master_params(param_group, master_param.view(-1)) + ): + assert name in state_dict + state_dict[name] = unflat_master_param + else: + state_dict = model.state_dict() + for i, (name, _value) in enumerate(model.named_parameters()): + assert name in state_dict + state_dict[name] = master_params[i] + return state_dict + + +def state_dict_to_master_params(model, state_dict, use_fp16): + if use_fp16: + named_model_params = [ + (name, state_dict[name]) for name, _ in model.named_parameters() + ] + param_groups_and_shapes = get_param_groups_and_shapes(named_model_params) + master_params = make_master_params(param_groups_and_shapes) + else: + master_params = [state_dict[name] for name, _ in model.named_parameters()] + return master_params + + +def zero_master_grads(master_params): + for param in master_params: + param.grad = None + + +def zero_grad(model_params): + for param in model_params: + # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group + if param.grad is not None: + param.grad.detach_() + param.grad.zero_() + + +def param_grad_or_zeros(param): + if param.grad is not None: + return param.grad.data.detach() + else: + return th.zeros_like(param) + + +class MixedPrecisionTrainer: + def __init__( + self, + *, + model, + use_fp16=False, + fp16_scale_growth=1e-3, + initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE, + ): + self.model = model + self.use_fp16 = use_fp16 + self.fp16_scale_growth = fp16_scale_growth + + self.model_params = list(self.model.parameters()) + self.master_params = self.model_params + self.param_groups_and_shapes = None + self.lg_loss_scale = initial_lg_loss_scale + + if self.use_fp16: + self.param_groups_and_shapes = get_param_groups_and_shapes( + self.model.named_parameters() + ) + self.master_params = make_master_params(self.param_groups_and_shapes) + self.model.convert_to_fp16() + + def zero_grad(self): + zero_grad(self.model_params) + + def backward(self, loss: th.Tensor): + if self.use_fp16: + loss_scale = 2 ** self.lg_loss_scale + (loss * loss_scale).backward() + else: + loss.backward() + + def optimize(self, opt: th.optim.Optimizer): + if self.use_fp16: + return self._optimize_fp16(opt) + else: + return self._optimize_normal(opt) + + def _optimize_fp16(self, opt: th.optim.Optimizer): + logger.logkv_mean("lg_loss_scale", self.lg_loss_scale) + model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params) + grad_norm, param_norm = 
self._compute_norms(grad_scale=2 ** self.lg_loss_scale) + if check_overflow(grad_norm): + self.lg_loss_scale -= 1 + logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}") + zero_master_grads(self.master_params) + return False + + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + + self.master_params[0].grad.mul_(1.0 / (2 ** self.lg_loss_scale)) + opt.step() + zero_master_grads(self.master_params) + master_params_to_model_params(self.param_groups_and_shapes, self.master_params) + self.lg_loss_scale += self.fp16_scale_growth + return True + + def _optimize_normal(self, opt: th.optim.Optimizer): + grad_norm, param_norm = self._compute_norms() + logger.logkv_mean("grad_norm", grad_norm) + logger.logkv_mean("param_norm", param_norm) + opt.step() + return True + + def _compute_norms(self, grad_scale=1.0): + grad_norm = 0.0 + param_norm = 0.0 + for p in self.master_params: + with th.no_grad(): + param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2 + if p.grad is not None: + grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2 + return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm) + + def master_params_to_state_dict(self, master_params): + return master_params_to_state_dict( + self.model, self.param_groups_and_shapes, master_params, self.use_fp16 + ) + + def state_dict_to_master_params(self, state_dict): + return state_dict_to_master_params(self.model, state_dict, self.use_fp16) + + +def check_overflow(value): + return (value == float("inf")) or (value == -float("inf")) or (value != value) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..51f13385337c0b4ca9f25cb4850eb245904a6443 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/gaussian_diffusion.py @@ -0,0 +1,1316 @@ +""" +This code started out as a PyTorch port of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py + +Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. +""" + +import enum +import math + +import numpy as np +import torch as th + +from .nn import mean_flat +from .losses import normal_kl, discretized_gaussian_log_likelihood + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. 
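+        # With 1000 steps this reproduces the original Ho et al. range [1e-4, 0.02];
+        # for other step counts both endpoints are scaled by 1000 / num_diffusion_timesteps
+        # (e.g. 500 steps gives a linear ramp from 2e-4 to 0.04), which keeps the
+        # overall noise schedule roughly comparable across step counts.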
+ scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace( + beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64 + ) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2, + ) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +class ModelMeanType(enum.Enum): + """ + Which type of output the model predicts. + """ + + PREVIOUS_X = enum.auto() # the model predicts x_{t-1} + START_X = enum.auto() # the model predicts x_0 + EPSILON = enum.auto() # the model predicts epsilon + + +class ModelVarType(enum.Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. + """ + + LEARNED = enum.auto() + FIXED_SMALL = enum.auto() + FIXED_LARGE = enum.auto() + LEARNED_RANGE = enum.auto() + + +class LossType(enum.Enum): + MSE = enum.auto() # use raw MSE loss (and KL when learning variances) + RESCALED_MSE = ( + enum.auto() + ) # use raw MSE loss (with RESCALED_KL when learning variances) + KL = enum.auto() # use the variational lower-bound + RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB + + def is_vb(self): + return self == LossType.KL or self == LossType.RESCALED_KL + + +class GaussianDiffusion: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + + def __init__( + self, + *, + betas, + model_mean_type, + model_var_type, + loss_type, + rescale_timesteps=False, + ): + self.model_mean_type = model_mean_type + self.model_var_type = model_var_type + self.loss_type = loss_type + self.rescale_timesteps = rescale_timesteps + + # Use float64 for accuracy. 
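+        # Everything below is precomputed from the beta schedule: with
+        # alphas_cumprod = cumprod(1 - beta), q_sample() uses
+        #     x_t = sqrt(alphas_cumprod[t]) * x_0 + sqrt(1 - alphas_cumprod[t]) * noise,
+        # and the posterior_* arrays give the mean/variance of q(x_{t-1} | x_t, x_0).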
+ betas = np.array(betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps,) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = ( + betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:]) + ) + self.posterior_mean_coef1 = ( + betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod) + ) + self.posterior_mean_coef2 = ( + (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod) + ) + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + ) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = _extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
+ """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise + ) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + assert ( + posterior_mean.shape[0] == posterior_variance.shape[0] == posterior_log_variance_clipped.shape[0] == x_start.shape[0] + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance( + self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None + ): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. + """ + if model_kwargs is None: + model_kwargs = {} + + B, C = x.shape[:2] + assert t.shape == (B,) + model_output = model(x, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]: + assert model_output.shape == (B, C * 2, *x.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + if self.model_var_type == ModelVarType.LEARNED: + model_log_variance = model_var_values + model_variance = th.exp(model_log_variance) + else: + min_log = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x.shape + ) + max_log = _extract_into_tensor(np.log(self.betas), t, x.shape) + # The model_var_values is [-1, 1] for [min_var, max_var]. + frac = (model_var_values + 1) / 2 + model_log_variance = frac * max_log + (1 - frac) * min_log + model_variance = th.exp(model_log_variance) + else: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. 
+ ModelVarType.FIXED_LARGE: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log(np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.FIXED_SMALL: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape) + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type == ModelMeanType.PREVIOUS_X: + pred_xstart = process_xstart( + self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output) + ) + model_mean = model_output + elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]: + if self.model_mean_type == ModelMeanType.START_X: + pred_xstart = process_xstart(model_output) + else: + pred_xstart = process_xstart( + self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output) + ) + model_mean, _, _ = self.q_posterior_mean_variance( + x_start=pred_xstart, x_t=x, t=t + ) + else: + raise NotImplementedError(self.model_mean_type) + + assert ( + model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape + ) + return { + "mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps + ) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev - _extract_into_tensor(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape) * x_t + ) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + return t.float() * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). + """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_mean_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
+ """ + gradient = cond_fn(x, t, p_mean_var, **model_kwargs) + new_mean = ( + p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float() + ) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, self._scale_timesteps(t), **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def condition_score_with_grad(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, t, p_mean_var, **model_kwargs + ) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t + ) + return out + + def p_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. 
+ :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. + """ + with th.enable_grad(): + x = x.detach().requires_grad_() + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean_with_grad( + cond_fn, out, x, t, model_kwargs=model_kwargs + ) + sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"].detach()} + + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. + """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). 
+ """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices, desc="Steps") + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + sample_fn = self.p_sample_with_grad if cond_fn_with_grad else self.p_sample + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + inpainting_mode=False, + orig_img=None, + mask_inpaint=None, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + if inpainting_mode: + noised_orig_img = th.sqrt(alpha_bar) * orig_img + \ + th.sqrt(1 - alpha_bar) * th.randn_like(x) + # noised_orig_img_pil = TF.to_pil_image(noised_orig_img[0].add(1).div(2).clamp(0, 1)) + # noised_orig_img_pil.save(f'/content/drive/MyDrive/AI/Disco_Diffusion/images_out/InpaintingTest/inpainting_dump/noised_orig_{t[0].item()}.png') + x = (1 - mask_inpaint) * noised_orig_img + mask_inpaint * x + # mixed_x = TF.to_pil_image(x[0].add(1).div(2).clamp(0, 1)) + # mixed_x.save(f'/content/drive/MyDrive/AI/Disco_Diffusion/images_out/InpaintingTest/inpainting_dump/mixed_x_{t[0].item()}.png') + + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"]} + + def ddim_sample_with_grad( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). 
+ """ + with th.enable_grad(): + x = x.detach().requires_grad_() + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, + model_kwargs=model_kwargs) + else: + out = out_orig + + out["pred_xstart"] = out["pred_xstart"].detach() + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = ( + eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) + ) + # Equation 12. + noise = th.randn_like(x) + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps + ) + nonzero_mask = ( + (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"].detach()} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"]) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. reversed + mean_pred = ( + out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps + ) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + transformation_fn=None, + transformation_percent=[], + inpainting_mode=False, + mask_inpaint=None, + skip_timesteps_orig=None + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. 
+ + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + transformation_steps = [int(len(indices) * (1 - i)) for i in transformation_percent] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + indices = tqdm(indices, desc="Steps") + + if inpainting_mode and skip_timesteps_orig is None: + skip_timesteps_orig = self.num_timesteps + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + if i in transformation_steps and transformation_fn is not None: + img = transformation_fn(img) + sample_fn = self.ddim_sample_with_grad if cond_fn_with_grad else self.ddim_sample + if inpainting_mode \ + and i >= self.num_timesteps - skip_timesteps_orig \ + and not cond_fn_with_grad: + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + inpainting_mode=inpainting_mode, + orig_img=init_image, + mask_inpaint=mask_inpaint, + ) + else: + out = sample_fn( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + def plms_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + cond_fn_with_grad=False, + order=2, + old_out=None, + ): + """ + Sample x_{t-1} from the model using Pseudo Linear Multistep. + + Same usage as p_sample(). + """ + if not int(order) or not 1 <= order <= 4: + raise ValueError('order is invalid (should be int from 1-4).') + + def get_model_output(x, t): + with th.set_grad_enabled(cond_fn_with_grad and cond_fn is not None): + x = x.detach().requires_grad_() if cond_fn_with_grad else x + out_orig = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + if cond_fn_with_grad: + out = self.condition_score_with_grad(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + x = x.detach() + else: + out = self.condition_score(cond_fn, out_orig, x, t, model_kwargs=model_kwargs) + else: + out = out_orig + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. 
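+            # _predict_eps_from_xstart() inverts the forward process:
+            #     eps = (sqrt(1 / alpha_bar_t) * x_t - x_0) / sqrt(1 / alpha_bar_t - 1),
+            # so the multistep combination below always works in epsilon space.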
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + return eps, out, out_orig + + # alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + eps, out, out_orig = get_model_output(x, t) + + if order > 1 and old_out is None: + # Pseudo Improved Euler + old_eps = [eps] + mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps + eps_2, _, _ = get_model_output(mean_pred, t - 1) + eps_prime = (eps + eps_2) / 2 + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps_prime + else: + # Pseudo Linear Multistep (Adams-Bashforth) + old_eps = old_out["old_eps"] + old_eps.append(eps) + cur_order = min(order, len(old_eps)) + if cur_order == 1: + eps_prime = old_eps[-1] + elif cur_order == 2: + eps_prime = (3 * old_eps[-1] - old_eps[-2]) / 2 + elif cur_order == 3: + eps_prime = (23 * old_eps[-1] - 16 * old_eps[-2] + 5 * old_eps[-3]) / 12 + elif cur_order == 4: + eps_prime = (55 * old_eps[-1] - 59 * old_eps[-2] + 37 * old_eps[-3] - 9 * old_eps[-4]) / 24 + else: + raise RuntimeError('cur_order is invalid.') + pred_prime = self._predict_xstart_from_eps(x, t, eps_prime) + mean_pred = pred_prime * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev) * eps_prime + + if len(old_eps) >= order: + old_eps.pop(0) + + nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + sample = mean_pred * nonzero_mask + out["pred_xstart"] * (1 - nonzero_mask) + + return {"sample": sample, "pred_xstart": out_orig["pred_xstart"], "old_eps": old_eps} + + def plms_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Generate samples from the model using Pseudo Linear Multistep. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.plms_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + skip_timesteps=skip_timesteps, + init_image=init_image, + randomize_class=randomize_class, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + ): + final = sample + return final["sample"] + + def plms_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + skip_timesteps=0, + init_image=None, + randomize_class=False, + cond_fn_with_grad=False, + order=2, + ): + """ + Use PLMS to sample from the model and yield intermediate samples from each + timestep of PLMS. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + + if skip_timesteps and init_image is None: + init_image = th.zeros_like(img) + + indices = list(range(self.num_timesteps - skip_timesteps))[::-1] + + if init_image is not None: + my_t = th.ones([shape[0]], device=device, dtype=th.long) * indices[0] + img = self.q_sample(init_image, my_t, img) + + if progress: + # Lazy import so that we don't depend on tqdm. 
+ from tqdm.auto import tqdm + + indices = tqdm(indices, desc="Steps") + + old_out = None + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + if randomize_class and 'y' in model_kwargs: + model_kwargs['y'] = th.randint(low=0, high=model.num_classes, + size=model_kwargs['y'].shape, + device=model_kwargs['y'].device) + with th.no_grad(): + out = self.plms_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + cond_fn_with_grad=cond_fn_with_grad, + order=order, + old_out=old_out, + ) + yield out + old_out = out + img = out["sample"] + + def _vb_terms_bpd( + self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None + ): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. + """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + ) + out = self.p_mean_variance( + model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs + ) + kl = normal_kl( + true_mean, true_log_variance_clipped, out["mean"], out["log_variance"] + ) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] + ) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = th.where((t == 0), decoder_nll, kl) + return {"output": output, "pred_xstart": out["pred_xstart"]} + + def training_losses(self, model, x_start, t, model_kwargs=None, noise=None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start, t, noise=noise) + + terms = {} + + if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL: + terms["loss"] = self._vb_terms_bpd( + model=model, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + model_kwargs=model_kwargs, + )["output"] + if self.loss_type == LossType.RESCALED_KL: + terms["loss"] *= self.num_timesteps + elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE: + model_output = model(x_t, self._scale_timesteps(t), **model_kwargs) + + if self.model_var_type in [ + ModelVarType.LEARNED, + ModelVarType.LEARNED_RANGE, + ]: + B, C = x_t.shape[:2] + assert model_output.shape == (B, C * 2, *x_t.shape[2:]) + model_output, model_var_values = th.split(model_output, C, dim=1) + # Learn the variance using the variational bound, but don't let + # it affect our mean prediction. 
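+                # The mean/eps channels are detached before entering the VB term, so only
+                # model_var_values receives gradient from it; the lambda passed as `model`
+                # simply returns this frozen output instead of re-running the network.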
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=1) + terms["vb"] = self._vb_terms_bpd( + model=lambda *args, r=frozen_out: r, + x_start=x_start, + x_t=x_t, + t=t, + clip_denoised=False, + )["output"] + if self.loss_type == LossType.RESCALED_MSE: + # Divide by 1000 for equivalence with initial implementation. + # Without a factor of 1/1000, the VB term hurts the MSE term. + terms["vb"] *= self.num_timesteps / 1000.0 + + target = { + ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t + )[0], + ModelMeanType.START_X: x_start, + ModelMeanType.EPSILON: noise, + }[self.model_mean_type] + assert model_output.shape == target.shape == x_start.shape + terms["mse"] = mean_flat((target - model_output) ** 2) + if "vb" in terms: + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise NotImplementedError(self.loss_type) + + return terms + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + + This term can't be optimized, as it only depends on the encoder. + + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0 + ) + return mean_flat(kl_prior) / np.log(2.0) + + def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None): + """ + Compute the entire variational lower-bound, measured in bits-per-dim, + as well as other related quantities. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param clip_denoised: if True, clip denoised samples. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + + :return: a dict containing the following keys: + - total_bpd: the total variational lower-bound, per batch element. + - prior_bpd: the prior term in the lower-bound. + - vb: an [N x T] tensor of terms in the lower-bound. + - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. 
+ """ + device = x_start.device + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = th.tensor([t] * batch_size, device=device) + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + with th.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"]) + mse.append(mean_flat((eps - noise) ** 2)) + + vb = th.stack(vb, dim=1) + xstart_mse = th.stack(xstart_mse, dim=1) + mse = th.stack(mse, dim=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(dim=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..9bdfc7b807ed34ac2334f01b9b09288c488de54e --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/logger.py @@ -0,0 +1,493 @@ +""" +Logger copied from OpenAI baselines to avoid extra RL-based dependencies: +https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py +""" + +import os +import sys +import os.path as osp +import json +import time +import datetime +import tempfile +import warnings +from collections import defaultdict +from contextlib import contextmanager + +DEBUG = 10 +INFO = 20 +WARN = 30 +ERROR = 40 + +DISABLED = 50 + + +class KVWriter(object): + def writekvs(self, kvs): + raise NotImplementedError + + +class SeqWriter(object): + def writeseq(self, seq): + raise NotImplementedError + + +class HumanOutputFormat(KVWriter, SeqWriter): + def __init__(self, filename_or_file): + if isinstance(filename_or_file, str): + self.file = open(filename_or_file, "wt") + self.own_file = True + else: + assert hasattr(filename_or_file, "read"), ( + "expected file or str, got %s" % filename_or_file + ) + self.file = filename_or_file + self.own_file = False + + def writekvs(self, kvs): + # Create strings for printing + key2str = {} + for (key, val) in sorted(kvs.items()): + if hasattr(val, "__float__"): + valstr = "%-8.3g" % val + else: + valstr = str(val) + key2str[self._truncate(key)] = self._truncate(valstr) + + # Find max widths + if len(key2str) == 0: + print("WARNING: tried to write empty key-value dict") + return + else: + keywidth = max(map(len, key2str.keys())) + valwidth = max(map(len, key2str.values())) + + # Write out the data + dashes = "-" * (keywidth + 
valwidth + 7) + lines = [dashes] + for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()): + lines.append( + "| %s%s | %s%s |" + % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val))) + ) + lines.append(dashes) + self.file.write("\n".join(lines) + "\n") + + # Flush the output to the file + self.file.flush() + + def _truncate(self, s): + maxlen = 30 + return s[: maxlen - 3] + "..." if len(s) > maxlen else s + + def writeseq(self, seq): + seq = list(seq) + for (i, elem) in enumerate(seq): + self.file.write(elem) + if i < len(seq) - 1: # add space unless this is the last one + self.file.write(" ") + self.file.write("\n") + self.file.flush() + + def close(self): + if self.own_file: + self.file.close() + + +class JSONOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "wt") + + def writekvs(self, kvs): + for k, v in sorted(kvs.items()): + if hasattr(v, "dtype"): + kvs[k] = float(v) + self.file.write(json.dumps(kvs) + "\n") + self.file.flush() + + def close(self): + self.file.close() + + +class CSVOutputFormat(KVWriter): + def __init__(self, filename): + self.file = open(filename, "w+t") + self.keys = [] + self.sep = "," + + def writekvs(self, kvs): + # Add our current row to the history + extra_keys = list(kvs.keys() - self.keys) + extra_keys.sort() + if extra_keys: + self.keys.extend(extra_keys) + self.file.seek(0) + lines = self.file.readlines() + self.file.seek(0) + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(",") + self.file.write(k) + self.file.write("\n") + for line in lines[1:]: + self.file.write(line[:-1]) + self.file.write(self.sep * len(extra_keys)) + self.file.write("\n") + for (i, k) in enumerate(self.keys): + if i > 0: + self.file.write(",") + v = kvs.get(k) + if v is not None: + self.file.write(str(v)) + self.file.write("\n") + self.file.flush() + + def close(self): + self.file.close() + + +class TensorBoardOutputFormat(KVWriter): + """ + Dumps key/value pairs into TensorBoard's numeric format. + """ + + def __init__(self, dir): + os.makedirs(dir, exist_ok=True) + self.dir = dir + self.step = 1 + prefix = "events" + path = osp.join(osp.abspath(dir), prefix) + import tensorflow as tf + from tensorflow.python import pywrap_tensorflow + from tensorflow.core.util import event_pb2 + from tensorflow.python.util import compat + + self.tf = tf + self.event_pb2 = event_pb2 + self.pywrap_tensorflow = pywrap_tensorflow + self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) + + def writekvs(self, kvs): + def summary_val(k, v): + kwargs = {"tag": k, "simple_value": float(v)} + return self.tf.Summary.Value(**kwargs) + + summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + event = self.event_pb2.Event(wall_time=time.time(), summary=summary) + event.step = ( + self.step + ) # is there any reason why you'd want to specify the step? 
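+        # self.step is a per-writer counter (incremented after each dump below),
+        # presumably so successive writekvs() calls are ordered on TensorBoard's step axis.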
+ self.writer.WriteEvent(event) + self.writer.Flush() + self.step += 1 + + def close(self): + if self.writer: + self.writer.Close() + self.writer = None + + +def make_output_format(format, ev_dir, log_suffix=""): + os.makedirs(ev_dir, exist_ok=True) + if format == "stdout": + return HumanOutputFormat(sys.stdout) + elif format == "log": + return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix)) + elif format == "json": + return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix)) + elif format == "csv": + return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix)) + elif format == "tensorboard": + return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix)) + else: + raise ValueError("Unknown format specified: %s" % (format,)) + + +# ================================================================ +# API +# ================================================================ + + +def logkv(key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used. + """ + get_current().logkv(key, val) + + +def logkv_mean(key, val): + """ + The same as logkv(), but if called many times, values averaged. + """ + get_current().logkv_mean(key, val) + + +def logkvs(d): + """ + Log a dictionary of key-value pairs + """ + for (k, v) in d.items(): + logkv(k, v) + + +def dumpkvs(): + """ + Write all of the diagnostics from the current iteration + """ + return get_current().dumpkvs() + + +def getkvs(): + return get_current().name2val + + +def log(*args, level=INFO): + """ + Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). + """ + get_current().log(*args, level=level) + + +def debug(*args): + log(*args, level=DEBUG) + + +def info(*args): + log(*args, level=INFO) + + +def warn(*args): + log(*args, level=WARN) + + +def error(*args): + log(*args, level=ERROR) + + +def set_level(level): + """ + Set logging threshold on current logger. + """ + get_current().set_level(level) + + +def set_comm(comm): + get_current().set_comm(comm) + + +def get_dir(): + """ + Get directory that log files are being written to. + will be None if there is no output directory (i.e., if you didn't call start) + """ + return get_current().get_dir() + + +record_tabular = logkv +dump_tabular = dumpkvs + + +@contextmanager +def profile_kv(scopename): + logkey = "wait_" + scopename + tstart = time.time() + try: + yield + finally: + get_current().name2val[logkey] += time.time() - tstart + + +def profile(n): + """ + Usage: + @profile("my_func") + def my_func(): code + """ + + def decorator_with_name(func): + def func_wrapper(*args, **kwargs): + with profile_kv(n): + return func(*args, **kwargs) + + return func_wrapper + + return decorator_with_name + + +# ================================================================ +# Backend +# ================================================================ + + +def get_current(): + if Logger.CURRENT is None: + _configure_default_logger() + + return Logger.CURRENT + + +class Logger(object): + DEFAULT = None # A logger with no output files. 
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + CURRENT = None # Current logger being used by the free functions above + + def __init__(self, dir, output_formats, comm=None): + self.name2val = defaultdict(float) # values this iteration + self.name2cnt = defaultdict(int) + self.level = INFO + self.dir = dir + self.output_formats = output_formats + self.comm = comm + + # Logging API, forwarded + # ---------------------------------------- + def logkv(self, key, val): + self.name2val[key] = val + + def logkv_mean(self, key, val): + oldval, cnt = self.name2val[key], self.name2cnt[key] + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) + self.name2cnt[key] = cnt + 1 + + def dumpkvs(self): + if self.comm is None: + d = self.name2val + else: + d = mpi_weighted_mean( + self.comm, + { + name: (val, self.name2cnt.get(name, 1)) + for (name, val) in self.name2val.items() + }, + ) + if self.comm.rank != 0: + d["dummy"] = 1 # so we don't get a warning about empty dict + out = d.copy() # Return the dict for unit testing purposes + for fmt in self.output_formats: + if isinstance(fmt, KVWriter): + fmt.writekvs(d) + self.name2val.clear() + self.name2cnt.clear() + return out + + def log(self, *args, level=INFO): + if self.level <= level: + self._do_log(args) + + # Configuration + # ---------------------------------------- + def set_level(self, level): + self.level = level + + def set_comm(self, comm): + self.comm = comm + + def get_dir(self): + return self.dir + + def close(self): + for fmt in self.output_formats: + fmt.close() + + # Misc + # ---------------------------------------- + def _do_log(self, args): + for fmt in self.output_formats: + if isinstance(fmt, SeqWriter): + fmt.writeseq(map(str, args)) + + +def get_rank_without_mpi_import(): + # check environment variables here instead of importing mpi4py + # to avoid calling MPI_Init() when this module is imported + for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]: + if varname in os.environ: + return int(os.environ[varname]) + return 0 + + +def mpi_weighted_mean(comm, local_name2valcount): + """ + Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110 + Perform a weighted average over dicts that are each on a different node + Input: local_name2valcount: dict mapping key -> (value, count) + Returns: key -> mean + """ + all_name2valcount = comm.gather(local_name2valcount) + if comm.rank == 0: + name2sum = defaultdict(float) + name2count = defaultdict(float) + for n2vc in all_name2valcount: + for (name, (val, count)) in n2vc.items(): + try: + val = float(val) + except ValueError: + if comm.rank == 0: + warnings.warn( + "WARNING: tried to compute mean on non-float {}={}".format( + name, val + ) + ) + else: + name2sum[name] += val * count + name2count[name] += count + return {name: name2sum[name] / name2count[name] for name in name2sum} + else: + return {} + + +def configure(dir=None, format_strs=None, comm=None, log_suffix=""): + """ + If comm is provided, average all numerical stats across that comm + """ + if dir is None: + dir = os.getenv("OPENAI_LOGDIR") + if dir is None: + dir = osp.join( + tempfile.gettempdir(), + datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"), + ) + assert isinstance(dir, str) + dir = os.path.expanduser(dir) + os.makedirs(os.path.expanduser(dir), exist_ok=True) + + rank = get_rank_without_mpi_import() + if rank > 0: + log_suffix = log_suffix + "-rank%03i" % 
rank + + if format_strs is None: + if rank == 0: + format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",") + else: + format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",") + format_strs = filter(None, format_strs) + output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] + + Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm) + if output_formats: + log("Logging to %s" % dir) + + +def _configure_default_logger(): + configure() + Logger.DEFAULT = Logger.CURRENT + + +def reset(): + if Logger.CURRENT is not Logger.DEFAULT: + Logger.CURRENT.close() + Logger.CURRENT = Logger.DEFAULT + log("Reset logger") + + +@contextmanager +def scoped_configure(dir=None, format_strs=None, comm=None): + prevlogger = Logger.CURRENT + configure(dir=dir, format_strs=format_strs, comm=comm) + try: + yield + finally: + Logger.CURRENT.close() + Logger.CURRENT = prevlogger diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..162771bf5df31a008b1f4b6b27bd42432955fdc9 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/losses.py @@ -0,0 +1,73 @@ +""" +Helpers for various likelihood-based losses. These are ported from the original +Ho et al. diffusion models codebase: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py +""" + +import numpy as np + +import torch as th + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, th.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [ + x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2) + ) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. + """ + return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). 
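+
+    Illustrative example (not from the original codebase; random inputs, shown
+    only to document the expected shapes and value range):
+
+        x = (th.randint(0, 256, (2, 3, 8, 8)).float() / 127.5) - 1.0
+        means = th.zeros_like(x)
+        log_scales = th.full_like(x, -2.0)
+        nll = discretized_gaussian_log_likelihood(x, means=means, log_scales=log_scales)
+        # nll has the same shape as x; sum over non-batch dims for per-image nats.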
+ """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = th.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = th.where( + x < -0.999, + log_cdf_plus, + th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py new file mode 100644 index 0000000000000000000000000000000000000000..b28bd83cf23b4e19868afc2075b11ca1cfbd0e8d --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/nn.py @@ -0,0 +1,190 @@ +""" +Various utilities for neural networks. +""" + +import math + +import torch as th +import torch.nn as nn + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + def forward(self, x): + return x * th.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def update_ema(target_params, source_params, rate=0.99): + """ + Update target parameters to be closer to those of source parameters using + an exponential moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). + """ + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +def timestep_embedding(timesteps, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. 
+ :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = th.exp( + -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = th.cat([th.cos(args), th.sin(args)], dim=-1) + if dim % 2: + embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(th.autograd.Function): + @staticmethod + @th.cuda.amp.custom_fwd + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_length = length + ctx.save_for_backward(*args) + with th.no_grad(): + output_tensors = ctx.run_function(*args[:length]) + return output_tensors + + @staticmethod + @th.cuda.amp.custom_bwd + def backward(ctx, *output_grads): + args = list(ctx.saved_tensors) + + # Filter for inputs that require grad. If none, exit early. + input_indices = [i for (i, x) in enumerate(args) if x.requires_grad] + if not input_indices: + return (None, None) + tuple(None for _ in args) + + with th.enable_grad(): + for i in input_indices: + if i < ctx.input_length: + # Not sure why the OAI code does this little + # dance. It might not be necessary. + args[i] = args[i].detach().requires_grad_() + args[i] = args[i].view_as(args[i]) + output_tensors = ctx.run_function(*args[:ctx.input_length]) + + if isinstance(output_tensors, th.Tensor): + output_tensors = [output_tensors] + + # Filter for outputs that require grad. If none, exit early. + out_and_grads = [(o, g) for (o, g) in zip(output_tensors, output_grads) if o.requires_grad] + if not out_and_grads: + return (None, None) + tuple(None for _ in args) + + # Compute gradients on the filtered tensors. + computed_grads = th.autograd.grad( + [o for (o, g) in out_and_grads], + [args[i] for i in input_indices], + [g for (o, g) in out_and_grads] + ) + + # Reassemble the complete gradient tuple. + input_grads = [None for _ in args] + for (i, g) in zip(input_indices, computed_grads): + input_grads[i] = g + return (None, None) + tuple(input_grads) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py new file mode 100644 index 0000000000000000000000000000000000000000..c82eccdcd47c468d41e7cbe02de6a731f2c9bf81 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/resample.py @@ -0,0 +1,154 @@ +from abc import ABC, abstractmethod + +import numpy as np +import torch as th +import torch.distributed as dist + + +def create_named_schedule_sampler(name, diffusion): + """ + Create a ScheduleSampler from a library of pre-defined samplers. + + :param name: the name of the sampler. + :param diffusion: the diffusion object to sample for. 
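+
+    Illustrative usage (assumes `diffusion` is an already-constructed diffusion
+    object exposing `num_timesteps`, e.g. from create_gaussian_diffusion):
+
+        sampler = create_named_schedule_sampler("uniform", diffusion)
+        t, weights = sampler.sample(batch_size=4, device=th.device("cpu"))
+        # t: LongTensor of timestep indices; weights: per-sample loss rescaling factors.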
+ """ + if name == "uniform": + return UniformSampler(diffusion) + elif name == "loss-second-moment": + return LossSecondMomentResampler(diffusion) + else: + raise NotImplementedError(f"unknown schedule sampler: {name}") + + +class ScheduleSampler(ABC): + """ + A distribution over timesteps in the diffusion process, intended to reduce + variance of the objective. + + By default, samplers perform unbiased importance sampling, in which the + objective's mean is unchanged. + However, subclasses may override sample() to change how the resampled + terms are reweighted, allowing for actual changes in the objective. + """ + + @abstractmethod + def weights(self): + """ + Get a numpy array of weights, one per diffusion step. + + The weights needn't be normalized, but must be positive. + """ + + def sample(self, batch_size, device): + """ + Importance-sample timesteps for a batch. + + :param batch_size: the number of timesteps. + :param device: the torch device to save to. + :return: a tuple (timesteps, weights): + - timesteps: a tensor of timestep indices. + - weights: a tensor of weights to scale the resulting losses. + """ + w = self.weights() + p = w / np.sum(w) + indices_np = np.random.choice(len(p), size=(batch_size,), p=p) + indices = th.from_numpy(indices_np).long().to(device) + weights_np = 1 / (len(p) * p[indices_np]) + weights = th.from_numpy(weights_np).float().to(device) + return indices, weights + + +class UniformSampler(ScheduleSampler): + def __init__(self, diffusion): + self.diffusion = diffusion + self._weights = np.ones([diffusion.num_timesteps]) + + def weights(self): + return self._weights + + +class LossAwareSampler(ScheduleSampler): + def update_with_local_losses(self, local_ts, local_losses): + """ + Update the reweighting using losses from a model. + + Call this method from each rank with a batch of timesteps and the + corresponding losses for each of those timesteps. + This method will perform synchronization to make sure all of the ranks + maintain the exact same reweighting. + + :param local_ts: an integer Tensor of timesteps. + :param local_losses: a 1D Tensor of losses. + """ + batch_sizes = [ + th.tensor([0], dtype=th.int32, device=local_ts.device) + for _ in range(dist.get_world_size()) + ] + dist.all_gather( + batch_sizes, + th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device), + ) + + # Pad all_gather batches to be the maximum batch size. + batch_sizes = [x.item() for x in batch_sizes] + max_bs = max(batch_sizes) + + timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes] + loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes] + dist.all_gather(timestep_batches, local_ts) + dist.all_gather(loss_batches, local_losses) + timesteps = [ + x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs] + ] + losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]] + self.update_with_all_losses(timesteps, losses) + + @abstractmethod + def update_with_all_losses(self, ts, losses): + """ + Update the reweighting using losses from a model. + + Sub-classes should override this method to update the reweighting + using losses from the model. + + This method directly updates the reweighting without synchronizing + between workers. It is called by update_with_local_losses from all + ranks with identical arguments. Thus, it should have deterministic + behavior to maintain state across workers. + + :param ts: a list of int timesteps. + :param losses: a list of float losses, one per timestep. 
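+
+        Illustrative call pattern (hypothetical values):
+
+            sampler.update_with_all_losses(ts=[3, 17, 42], losses=[0.9, 0.4, 0.2])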
+ """ + + +class LossSecondMomentResampler(LossAwareSampler): + def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001): + self.diffusion = diffusion + self.history_per_term = history_per_term + self.uniform_prob = uniform_prob + self._loss_history = np.zeros( + [diffusion.num_timesteps, history_per_term], dtype=np.float64 + ) + self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int) + + def weights(self): + if not self._warmed_up(): + return np.ones([self.diffusion.num_timesteps], dtype=np.float64) + weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1)) + weights /= np.sum(weights) + weights *= 1 - self.uniform_prob + weights += self.uniform_prob / len(weights) + return weights + + def update_with_all_losses(self, ts, losses): + for t, loss in zip(ts, losses): + if self._loss_counts[t] == self.history_per_term: + # Shift out the oldest loss term. + self._loss_history[t, :-1] = self._loss_history[t, 1:] + self._loss_history[t, -1] = loss + else: + self._loss_history[t, self._loss_counts[t]] = loss + self._loss_counts[t] += 1 + + def _warmed_up(self): + return (self._loss_counts == self.history_per_term).all() diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py new file mode 100644 index 0000000000000000000000000000000000000000..3e4ed31dba05d43cba1a262c0a166ab7df10fd9a --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/respace.py @@ -0,0 +1,128 @@ +import numpy as np +import torch as th + +from .gaussian_diffusion import GaussianDiffusion + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. + :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. 
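+
+    Illustrative examples:
+
+        space_timesteps(300, [10, 15, 20])  # 45 steps: 10 + 15 + 20 from three 100-step sections
+        space_timesteps(1000, "250")        # 250 roughly evenly spaced steps
+        space_timesteps(1000, "ddim50")     # 50 steps using the fixed DDIM stride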
+ """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim"):]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}" + ) + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +class SpacedDiffusion(GaussianDiffusion): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. + """ + + def __init__(self, use_timesteps, **kwargs): + self.use_timesteps = set(use_timesteps) + self.timestep_map = [] + self.original_num_steps = len(kwargs["betas"]) + + base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + kwargs["betas"] = np.array(new_betas) + super().__init__(**kwargs) + + def p_mean_variance( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, **kwargs) + + def training_losses( + self, model, *args, **kwargs + ): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs) + + def _wrap_model(self, model): + if isinstance(model, _WrappedModel): + return model + return _WrappedModel( + model, self.timestep_map, self.rescale_timesteps, self.original_num_steps + ) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. 
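+        # (_WrappedModel below maps the spaced indices back onto the original
+        # schedule and, if rescale_timesteps is set, rescales them to [0, 1000).)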
+ return t + + +class _WrappedModel: + def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def __call__(self, x, ts, **kwargs): + map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype) + new_ts = map_tensor[ts] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return self.model(x, new_ts, **kwargs) diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py new file mode 100644 index 0000000000000000000000000000000000000000..35af1fa83fc5588bd3a90e1200e13f70c342fcd7 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/script_util.py @@ -0,0 +1,456 @@ +import argparse +import inspect + +from . import gaussian_diffusion as gd +from .respace import SpacedDiffusion, space_timesteps +from .unet import SuperResModel, EncoderUNetModel + +NUM_CLASSES = 1000 + + +def diffusion_defaults(): + """ + Defaults for image and classifier training. + """ + return dict( + learn_sigma=False, + diffusion_steps=1000, + noise_schedule="linear", + timestep_respacing="", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + ) + + +def classifier_defaults(): + """ + Defaults for classifier models. + """ + return dict( + image_size=64, + classifier_use_fp16=False, + classifier_width=128, + classifier_depth=2, + classifier_attention_resolutions="32,16,8", # 16 + classifier_use_scale_shift_norm=True, # False + classifier_resblock_updown=True, # False + classifier_pool="attention", + ) + + +def model_and_diffusion_defaults(): + """ + Defaults for image training. 
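+
+    Illustrative usage (mirrors the argparse helpers at the bottom of this file;
+    note that in this copy create_model() returns None, so only the diffusion
+    object is usable through this particular path):
+
+        defaults = model_and_diffusion_defaults()
+        parser = argparse.ArgumentParser()
+        add_dict_to_argparser(parser, defaults)
+        args = parser.parse_args([])
+        model, diffusion = create_model_and_diffusion(
+            **args_to_dict(args, defaults.keys())
+        )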
+ """ + res = dict( + image_size=64, + num_channels=128, + num_res_blocks=2, + num_heads=4, + num_heads_upsample=-1, + num_head_channels=-1, + attention_resolutions="16,8", + channel_mult="", + dropout=0.0, + class_cond=False, + use_checkpoint=False, + use_scale_shift_norm=True, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, + ) + res.update(diffusion_defaults()) + return res + + +def classifier_and_diffusion_defaults(): + res = classifier_defaults() + res.update(diffusion_defaults()) + return res + + +def create_model_and_diffusion( + image_size, + class_cond, + learn_sigma, + num_channels, + num_res_blocks, + channel_mult, + num_heads, + num_head_channels, + num_heads_upsample, + attention_resolutions, + dropout, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, + use_checkpoint, + use_scale_shift_norm, + resblock_updown, + use_fp16, + use_new_attention_order, +): + model = create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult=channel_mult, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + use_new_attention_order=use_new_attention_order, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return model, diffusion + + +def create_model( + image_size, + num_channels, + num_res_blocks, + channel_mult="", + learn_sigma=False, + class_cond=False, + use_checkpoint=False, + attention_resolutions="16", + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + dropout=0, + resblock_updown=False, + use_fp16=False, + use_new_attention_order=False, +): + if channel_mult == "": + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported image size: {image_size}") + else: + channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(",")) + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + # config = UNetConfig() + # return HFUNetModel(config=config) + return None + + # return UNetModel( + # image_size=image_size, + # in_channels=3, + # model_channels=num_channels, + # out_channels=(3 if not learn_sigma else 6), + # num_res_blocks=num_res_blocks, + # attention_resolutions=tuple(attention_ds), + # dropout=dropout, + # channel_mult=channel_mult, + # num_classes=(NUM_CLASSES if class_cond else None), + # use_checkpoint=use_checkpoint, + # use_fp16=use_fp16, + # num_heads=num_heads, + # num_head_channels=num_head_channels, + # num_heads_upsample=num_heads_upsample, + # use_scale_shift_norm=use_scale_shift_norm, + # resblock_updown=resblock_updown, + # use_new_attention_order=use_new_attention_order, + # ) + + +def create_classifier_and_diffusion( + image_size, + classifier_use_fp16, + 
classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, + learn_sigma, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, +): + classifier = create_classifier( + image_size, + classifier_use_fp16, + classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return classifier, diffusion + + +def create_classifier( + image_size, + classifier_use_fp16, + classifier_width, + classifier_depth, + classifier_attention_resolutions, + classifier_use_scale_shift_norm, + classifier_resblock_updown, + classifier_pool, +): + if image_size == 512: + channel_mult = (0.5, 1, 1, 2, 2, 4, 4) + elif image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 128: + channel_mult = (1, 1, 2, 3, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported image size: {image_size}") + + attention_ds = [] + for res in classifier_attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + return EncoderUNetModel( + image_size=image_size, + in_channels=3, + model_channels=classifier_width, + out_channels=1000, + num_res_blocks=classifier_depth, + attention_resolutions=tuple(attention_ds), + channel_mult=channel_mult, + use_fp16=classifier_use_fp16, + num_head_channels=64, + use_scale_shift_norm=classifier_use_scale_shift_norm, + resblock_updown=classifier_resblock_updown, + pool=classifier_pool, + ) + + +def sr_model_and_diffusion_defaults(): + res = model_and_diffusion_defaults() + res["large_size"] = 256 + res["small_size"] = 64 + arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0] + for k in res.copy().keys(): + if k not in arg_names: + del res[k] + return res + + +def sr_create_model_and_diffusion( + large_size, + small_size, + class_cond, + learn_sigma, + num_channels, + num_res_blocks, + num_heads, + num_head_channels, + num_heads_upsample, + attention_resolutions, + dropout, + diffusion_steps, + noise_schedule, + timestep_respacing, + use_kl, + predict_xstart, + rescale_timesteps, + rescale_learned_sigmas, + use_checkpoint, + use_scale_shift_norm, + resblock_updown, + use_fp16, +): + model = sr_create_model( + large_size, + small_size, + num_channels, + num_res_blocks, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + ) + diffusion = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=timestep_respacing, + ) + return model, diffusion + + +def sr_create_model( + large_size, + small_size, + num_channels, + 
num_res_blocks, + learn_sigma, + class_cond, + use_checkpoint, + attention_resolutions, + num_heads, + num_head_channels, + num_heads_upsample, + use_scale_shift_norm, + dropout, + resblock_updown, + use_fp16, +): + _ = small_size # hack to prevent unused variable + + if large_size == 512: + channel_mult = (1, 1, 2, 2, 4, 4) + elif large_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif large_size == 64: + channel_mult = (1, 2, 3, 4) + else: + raise ValueError(f"unsupported large size: {large_size}") + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(large_size // int(res)) + + return SuperResModel( + image_size=large_size, + in_channels=3, + model_channels=num_channels, + out_channels=(3 if not learn_sigma else 6), + num_res_blocks=num_res_blocks, + attention_resolutions=tuple(attention_ds), + dropout=dropout, + channel_mult=channel_mult, + num_classes=(NUM_CLASSES if class_cond else None), + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + resblock_updown=resblock_updown, + use_fp16=use_fp16, + ) + + +def create_gaussian_diffusion( + *, + steps=1000, + learn_sigma=False, + sigma_small=False, + noise_schedule="linear", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing="", +): + betas = gd.get_named_beta_schedule(noise_schedule, steps) + if use_kl: + loss_type = gd.LossType.RESCALED_KL + elif rescale_learned_sigmas: + loss_type = gd.LossType.RESCALED_MSE + else: + loss_type = gd.LossType.MSE + if not timestep_respacing: + timestep_respacing = [steps] + return SpacedDiffusion( + use_timesteps=space_timesteps(steps, timestep_respacing), + betas=betas, + model_mean_type=( + gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X + ), + model_var_type=( + ( + gd.ModelVarType.FIXED_LARGE + if not sigma_small + else gd.ModelVarType.FIXED_SMALL + ) + if not learn_sigma + else gd.ModelVarType.LEARNED_RANGE + ), + loss_type=loss_type, + rescale_timesteps=rescale_timesteps, + ) + + +def add_dict_to_argparser(parser, default_dict): + for k, v in default_dict.items(): + v_type = type(v) + if v is None: + v_type = str + elif isinstance(v, bool): + v_type = str2bool + parser.add_argument(f"--{k}", default=v, type=v_type) + + +def args_to_dict(args, keys): + return {k: getattr(args, k) for k in keys} + + +def str2bool(v): + """ + https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + """ + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("boolean value expected") diff --git a/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..187b6c9737fda143f70e8dae365c35b690820466 --- /dev/null +++ b/fengshen/examples/disco_project/guided_diffusion/guided_diffusion/unet.py @@ -0,0 +1,975 @@ +from abc import abstractmethod + +import math + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from .fp16_util import convert_module_to_f16, convert_module_to_f32 +from .nn import ( + checkpoint, + conv_nd, + linear, + avg_pool_nd, + zero_module, + normalization, + 
timestep_embedding, +) + +from transformers import PreTrainedModel, PretrainedConfig + + +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5 + ) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=1 + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. 
+ + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1 + ) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint( + self._forward, (x, emb), self.parameters(), self.use_checkpoint + ) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
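+
+    Illustrative example (shape-preserving; arbitrary sizes):
+
+        blk = AttentionBlock(channels=64, num_heads=4)
+        y = blk(th.randn(2, 64, 16, 16))  # y.shape == (2, 64, 16, 16)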
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint) + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial ** 2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
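+
+        Illustrative example (N=2, H=4 heads, C=16 channels per head, T=100):
+
+            attn = QKVAttention(n_heads=4)
+            out = attn(th.randn(2, 3 * 4 * 16, 100))  # out.shape == (2, 4 * 16, 100)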
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
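+
+    Illustrative example (a deliberately tiny configuration; the 512x512 setup used
+    elsewhere is wrapped by UNetConfig/HFUNetModel at the bottom of this module):
+
+        net = UNetModel(
+            image_size=64,
+            in_channels=3,
+            model_channels=32,
+            out_channels=3,
+            num_res_blocks=1,
+            attention_resolutions=(4,),
+            channel_mult=(1, 2, 4),
+            num_heads=2,
+        )
+        eps = net(th.randn(2, 3, 64, 64), th.tensor([10, 20]))  # eps.shape == (2, 3, 64, 64)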
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + ch = input_ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(num_res_blocks + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=int(model_channels * 
mult), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(model_channels * mult) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + if level and i == num_res_blocks: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) + if resblock_updown + else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)), + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps, y=None): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), "must specify y if and only if the model is class-conditional" + + hs = [] + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + if self.num_classes is not None: + assert y.shape == (x.shape[0],) + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + hs.append(h) + h = self.middle_block(h, emb) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb) + h = h.type(x.dtype) + return self.out(h) + + +class SuperResModel(UNetModel): + """ + A UNetModel that performs super-resolution. + + Expects an extra kwarg `low_res` to condition on a low-resolution image. + """ + + def __init__(self, image_size, in_channels, *args, **kwargs): + super().__init__(image_size, in_channels * 2, *args, **kwargs) + + def forward(self, x, timesteps, low_res=None, **kwargs): + _, _, new_height, new_width = x.shape + upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear") + x = th.cat([x, upsampled], dim=1) + return super().forward(x, timesteps, **kwargs) + + +class EncoderUNetModel(nn.Module): + """ + The half UNet model with attention and timestep embedding. + + For usage, see UNet. 
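+
+    Illustrative example (a small classifier-style encoder with attention pooling;
+    arbitrary sizes, not one of the released configurations):
+
+        enc = EncoderUNetModel(
+            image_size=64,
+            in_channels=3,
+            model_channels=32,
+            out_channels=10,
+            num_res_blocks=1,
+            attention_resolutions=(4,),
+            channel_mult=(1, 2, 4),
+            num_head_channels=16,
+            pool="attention",
+        )
+        logits = enc(th.randn(2, 3, 64, 64), th.tensor([10, 20]))  # logits.shape == (2, 10)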
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool="adaptive", + ): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + ch = int(channel_mult[0] * model_channels) + self.input_blocks = nn.ModuleList( + [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))] + ) + self._feature_size = ch + input_block_chans = [ch] + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=int(mult * model_channels), + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = int(mult * model_channels) + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == "adaptive": + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.AdaptiveAvgPool2d((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == "attention": + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + AttentionPool2d( + (image_size // ds), ch, num_head_channels, out_channels + ), + ) + elif pool == "spatial": + 
self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == "spatial_v2": + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + nn.SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f"Unexpected {pool} pooling") + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + + def forward(self, x, timesteps): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed(timestep_embedding(timesteps, self.model_channels)) + + results = [] + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith("spatial"): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = th.cat(results, axis=-1) + return self.out(h) + else: + h = h.type(x.dtype) + return self.out(h) + + +class UNetConfig(PretrainedConfig): + def __init__( + self, + image_size=512, + in_channels=3, + model_channels=256, + out_channels=6, + num_res_blocks=2, + attention_resolutions=[16, 32, 64], + dropout=0.0, + channel_mult=(0.5, 1, 1, 2, 2, 4, 4), + num_classes=None, + use_checkpoint=False, + use_fp16=True, + num_heads=4, + num_head_channels=64, + num_heads_upsample=-1, + use_scale_shift_norm=True, + resblock_updown=True, + use_new_attention_order=False, + **kwargs + ): + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.use_fp16 = use_fp16 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.use_scale_shift_norm = use_scale_shift_norm + self.resblock_updown = resblock_updown + self.use_new_attention_order = use_new_attention_order + super().__init__(**kwargs) + + +class HFUNetModel(PreTrainedModel): + config_class = UNetConfig + + def __init__(self, config): + super().__init__(config) + self.model = UNetModel( + image_size=config.image_size, + in_channels=config.in_channels, + model_channels=config.model_channels, + out_channels=config.out_channels, + num_res_blocks=config.num_res_blocks, + attention_resolutions=config.attention_resolutions, + dropout=config.dropout, + channel_mult=config.channel_mult, + num_classes=config.num_classes, + use_checkpoint=config.use_checkpoint, + use_fp16=config.use_fp16, + num_heads=config.num_heads, + num_head_channels=config.num_head_channels, + num_heads_upsample=config.num_heads_upsample, + use_scale_shift_norm=config.use_scale_shift_norm, + resblock_updown=config.resblock_updown, + use_new_attention_order=config.use_new_attention_order, + ) + + def forward(self, x, timesteps, y=None): + return self.model.forward(x, timesteps, y) + + def 
convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.model.input_blocks.apply(convert_module_to_f16) + self.model.middle_block.apply(convert_module_to_f16) + self.model.output_blocks.apply(convert_module_to_f16) diff --git a/fengshen/examples/disco_project/st_disco.py b/fengshen/examples/disco_project/st_disco.py new file mode 100644 index 0000000000000000000000000000000000000000..8676ac2763aab65300bdcb588ac74c4e672745d5 --- /dev/null +++ b/fengshen/examples/disco_project/st_disco.py @@ -0,0 +1,56 @@ +# from disco_huge import Diffuser +# from utils import * +from disco import Diffuser +import streamlit as st +from io import BytesIO +from PIL import Image +from disco import steps + + +@st.cache(show_spinner=False, allow_output_mutation=True) # 加装饰器, 只加载一次。 +class ST_Diffuser(Diffuser): + def __init__(self, custom_path): + super().__init__(custom_path) + + +if __name__ == '__main__': + dd = ST_Diffuser(custom_path="IDEA-CCNL/Taiyi-Diffusion-532M-Nature") # 初始化 + form = st.form("参数设置") + input_text = form.text_input('输入文本生成图像:', value='', placeholder='你想象的一个画面') + form.form_submit_button("提交") + uploaded_file = st.file_uploader("上传初始化图片(可选)", type=["jpg", "png", "jpeg"]) + + text_scale_norm = st.sidebar.slider('文本强度', 0.1, 1.0, 0.5, step=0.1) + text_scale = int(text_scale_norm * 10000) + res_skip_steps = st.sidebar.slider('加噪强度', 0.1, 1.0, 0.9, step=0.1) + skip_steps = int(steps - round(res_skip_steps * steps)) + width = st.sidebar.slider('宽度', 384, 1024, 512, step=64) + heigth = st.sidebar.slider('高度', 384, 1024, 512, step=64) + + with st.spinner('正在生成中...'): + capture_img = None + if uploaded_file is not None: + # To read file as bytes: + bytes_data = uploaded_file.getvalue() + # 将字节数据转化成字节流 + bytes_data = BytesIO(bytes_data) + # Image.open()可以读字节流 + capture_img = Image.open(bytes_data).convert('RGB').resize((width, heigth)) + + image_status = st.empty() + image_status.image(capture_img, use_column_width=True) + else: + image_status = st.empty() + + if input_text: + # global text_prompts + input_text_prompts = [input_text] + image = dd.generate(input_text_prompts, + capture_img, + clip_guidance_scale=text_scale, + skip_steps=skip_steps, + st_dynamic_image=image_status, + init_scale=None, + side_x=width, + side_y=heigth) # 最终结果。实时显示修改generate里面的内容。 + image_status.image(image, use_column_width=True) diff --git a/fengshen/examples/finetune_bart_qg/README.md b/fengshen/examples/finetune_bart_qg/README.md new file mode 100644 index 0000000000000000000000000000000000000000..33457b448b4356062ad4b1b00a22f00122c4fe83 --- /dev/null +++ b/fengshen/examples/finetune_bart_qg/README.md @@ -0,0 +1,106 @@ +## Randeng-BART-139M-QG-Chinese + + + +## 简介 Brief Introduction + +善于处理问题生成任务的中文版 BART-base 模型。 + +Good at solving question generation tasks Bart-base Model (Chinese version). + +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 通用 General | 自然语言转换 NLT | 燃灯 Randeng | BART | 139M | 问题生成任务-中文 QuestionGeneration-Chinese | + + +## 模型信息 Model Information + +本模型基于[IDEA-CCNL/Randeng-BART-139M](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M),我们在 [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) 数据集上微调了问题生成任务版本。 + +Based on [IDEA-CCNL/Randeng-BART-139M](https://huggingface.co./IDEA-CCNL/Randeng-BART-139M), we fine-tuned a question generation version on [ChineseSQuAD](https://github.com/pluto-junzeng/ChineseSquad) datasets. 
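+
+在微调和推理时,输入输出都采用「知识 / 回答 → 问题」的 prompt 拼接格式(与本 PR 中 finetune_bart.py 的 collator 保持一致)。A minimal sketch of the prompt format, assuming the same concatenation as the collator below; the concrete strings here are made-up examples for illustration, not taken from the dataset:
+
+```python
+context = "1939年9月1日德国入侵波兰后,第二次世界大战开始。"
+answer = "1939年9月1日"
+question = "第二次世界大战是什么时候开始的?"
+
+encoder_input = f"知识:{context}回答:{answer}"   # what the encoder sees
+decoder_target = f"问题:{question}"               # what the decoder is trained to produce
+```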
+
+Table1: 模型结构和配置 Model Architecture and Config
+
+| 配置 Config | 参数 Value |
+| ------------------- | --------- |
+| encoder_layers | 6 |
+| encoder_attn_heads | 12 |
+| encoder_ffn_dim | 3072 |
+| decoder_layers | 6 |
+| decoder_attn_heads | 12 |
+| decoder_ffn_dim | 3072 |
+| max_encoder_len | 512 |
+
+
+ChineseSQuAD 数据集翻译了部分 SQuAD 数据集,包含约 67k 有答案的训练样本和 43k 无答案训练样本。我们做了 9:1 的训练-开发集划分,并在公开的开发集上评测了效果。
+
+The dataset is translated from SQuAD 2.0 and contains around 67k samples with answers and 43k samples without answers. We split the training data into train/dev with a ratio of 9:1 and evaluate on the public dev set.
+
+Table 2: 数据集样本量 Dataset Sizes
+
+|       | all | have ans | no ans |
+|:------|:-------|:---------|:-------|
+| train_split | 100097 | 60879 | 39128 |
+| dev_split | 11089 | 6809 | 4280 |
+| dev | 10836 | 6645 | 4191 |
+
+
+## 使用 Usage
+
+### 环境安装 Install
+```
+git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git
+cd Fengshenbang-LM
+git submodule init
+git submodule update
+# submodule是我们用来管理数据集的fs_datasets,通过ssh的方式拉取,如果用户没有在机器上配置ssh-key的话可能会拉取失败。
+# 如果拉取失败,需要到.gitmodules文件中把ssh地址改为https地址即可。
+pip install --editable .
+```
+
+
+### 模型加载 Loading Models
+```python
+from transformers import AutoTokenizer, BartForConditionalGeneration
+tokenizer = AutoTokenizer.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese", additional_special_tokens=[""])
+model = BartForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-BART-139M-QG-Chinese")
+
+context = "知识:1939年9月1日德国入侵波兰后,第二次世界大战开始,华沙一直被保卫到9月27日。波兰中部,包括华沙,都在德国纳粹殖民地政府总政府的统治下。所有的高等教育机构都立即关闭,华沙的犹太人口——几十万,约占城市的 ——全部涌入华沙的贫民区。回答:30%"
+inputs = tokenizer.encode_plus(
+    context,
+    max_length=448,
+    padding="max_length",
+    truncation=True,
+    return_tensors='pt'
+)
+out = model.generate(
+    input_ids=inputs['input_ids'],
+    attention_mask=inputs['attention_mask'],
+    do_sample=True,
+    num_beams=5,
+    max_length=64,
+    top_p=0.9,
+)
+pred = tokenizer.batch_decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)[0]
+print(pred)
+# 问题:华沙的犹太人口占城市的百分之多少?
+```
+
+
+
+### 训练 Train
+```bash
+bash finetune_bart.sh
+```
+
+- finetune_bart.py 定义了数据处理输入输出方式和 finetune 的核心代码
+- finetune_bart.sh 训练脚本,具体参数可在此修改
+- utils.py 定义了独立的工具代码,重实现的函数等
+
+
+
+### 下游效果 Performance
+| Dataset | Model Size | BLEU-4 | METEOR | ROUGE-L |
+| ------------ | ----- | -------- | --------- | ---------- |
+| ChineseSQuAD | 139M | 22.17 | 40.38 | 38.17 |
diff --git a/fengshen/examples/finetune_bart_qg/finetune_bart.py b/fengshen/examples/finetune_bart_qg/finetune_bart.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c64589edf6c146632e656c96b7195d4ae87d81
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/finetune_bart.py
@@ -0,0 +1,429 @@
+# -*- encoding: utf-8 -*-
+'''
+Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@File : finetune_bart.py +@Time : 2022/10/28 18:23 +@Author : Qi Yang +@Version : 1.0 +@Contact : yangqi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' + + +from fengshen.models.model_utils import configure_optimizers +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.utils import chinese_char_tokenize +from utils import truncate_sequence, white_space_fix +from utils import LabelSmoothingCrossEntropy +import sys +import os +import torch +import argparse +import pytorch_lightning as pl +from dataclasses import dataclass +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import BartForConditionalGeneration +from transformers import BertTokenizer, AutoTokenizer +from torchmetrics.text.rouge import ROUGEScore +sys.path.append('../../../') + + +@dataclass +class QGT5Collator: + @ staticmethod + def add_data_specific_args(parent_args): + # the hyperparameters should be determined according to the max length of context in dataset + parser = parent_args.add_argument_group('BART DIalo Collator') + parser.add_argument('--max_seq_length', default=512, type=int) + parser.add_argument('--max_src_length', default=32, type=int) + parser.add_argument('--max_kno_length', default=416, type=int) + parser.add_argument('--max_tgt_length', default=64, type=int) + parser.add_argument('--mask_ans_style', + default='normal', + type=str, + choices=['normal', 'unmask', 'anstoken', 'postag', 'anstoken_multispan', 'postag_multispan', 'normal_multispan']) + return parent_args + + def __init__(self, tokenizer, args): + self.args = args + self.tokenizer = tokenizer + self.max_seq_length = args.max_seq_length + self.print_example = True + self.mask_ans_style = args.mask_ans_style + self.do_eval_only = args.do_eval_only + self.tokenizer_type = args.tokenizer_type + + def encode(self, x, y): + if self.tokenizer_type == "bert": + x = x + y = y + else: + # t5 sentence piece + x = self.tokenizer.bos_token + x + self.tokenizer.eos_token + y = y + self.tokenizer.eos_token + + encoder_input = self.tokenizer.encode_plus( + x, + max_length=self.args.max_kno_length + self.args.max_src_length, + padding="max_length", + truncation=True, + return_tensors='pt' + ) + decoder_output = self.tokenizer.encode_plus( + y, + max_length=self.args.max_tgt_length, + padding="max_length", + truncation=True, + return_tensors='pt' + ) + + return encoder_input, decoder_output + + def mask(self, s): + def replace_span(source, target, sptoken): + ans_bos, ans_eos = s["ans_span"][0] + return source[:ans_bos] + sptoken + source[ans_eos:] + + def replace_all(source, target, sptoken): + return source.replace(target, sptoken) + + if 'multispan' in self.mask_ans_style: + fn = replace_all + else: + fn = replace_span + + # unmask: 北京是中国的首都 + if 'unmask' in self.mask_ans_style: + return s["context"] + + # normal: 北京是 的首都 + if 'normal' in self.mask_ans_style: + self.anstoken = self.tokenizer.mask_token + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + # anstoken: 北京是 [ANS] 的首都 + if 'anstoken' in self.mask_ans_style: + anstoken_dict = { + "bert": "[ANS]", + "bart": "" + } + self.anstoken = anstoken_dict[self.tokenizer_type] + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + # postag: 北京是 中国 的首都 + if 'postag' in self.mask_ans_style: + begtoken, endtoken = "", "" + self.anstoken = begtoken + 
s["answer"][0] + endtoken + masked_context = fn(s["context"], s["answer"][0], self.anstoken) + return masked_context + + return masked_context + + def prompt(self, context, answer, question): + pre_prompt, mid_prompt, post_prompt = "知识:", "回答:", "问题:" # prompt + + context = truncate_sequence(context, self.args.max_kno_length-len(pre_prompt)-1) + + # used in squad-2.0 + # noted that src and tgt is reversed in qg + answer = truncate_sequence(answer, self.args.max_src_length - len(mid_prompt)-1) + question = truncate_sequence(question, self.args.max_tgt_length-len(post_prompt)-1) + + x_trunc = f'{pre_prompt}{context}{mid_prompt}{answer}' + y_trunc = f'{post_prompt}{question}' + return x_trunc, y_trunc + + def __call__(self, samples): + """ + ans_num = 1 适用于 Train 数据只有 1 条 answer 取第一条情况 + ans_num > 1 适用于 Dev 数据有多条 answer 情况 + Input: + input_ids: input_ids (text + answer) + attn_mask: input attn mask + labels: decoder_ids (question) + """ + input_ids, attn_mask, labels = [], [], [] + ans, qes, ctx, ans_spans, idxs, imp = [], [], [], [], [], [] + + for s in samples: + if self.do_eval_only: + # log origin answer to compare + ans.append(s["answer"]) + qes.append(s["question"]) + ctx.append(s["context"]) + ans_spans.append(s["ans_span"]) + idxs.append(s["idx"]) + + if "is_impossible" in s: + imp.append(s["is_impossible"]) + else: + imp.append(False) # SQUAD 1.0 don't have is_impossible + + if not s["is_impossible"]: # have ans and ans_span + context = self.mask(s) + answer = s["answer"][0] + question = s["question"] + else: # no ans and ans_span + context = s["context"] + answer = "无答案" + question = s["question"] + + x_trunc, y_trunc = self.prompt(context, answer, question) + encoder_input, decoder_output = self.encode(x_trunc, y_trunc) + + input_ids.append(encoder_input["input_ids"]) + attn_mask.append(encoder_input["attention_mask"]) + labels.append(decoder_output["input_ids"]) + + labels = torch.cat(labels) + if self.tokenizer_type == "bart": + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + else: + end_token_index = torch.where(labels == self.tokenizer.sep_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 # cross entropy cal + + data = { + 'input_ids': torch.cat(input_ids), + 'attention_mask': torch.cat(attn_mask), + 'labels': labels + } + if self.do_eval_only: + data.update({ + 'answer': ans, + 'question': qes, + 'context': ctx, + 'ans_span': ans_spans, + 'idx': idxs, + 'is_impossible': imp + }) + + if self.print_example: + print(x_trunc) + print(y_trunc) + self.print_example = False + + return data + + +class BARTFinetuneModel(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--model_path', type=str, default='') + parser.add_argument('--learning_rate', default=1e-5, type=float) + parser.add_argument('--min_learning_rate', default=1e-7, type=float) + parser.add_argument('--lr_decay_steps', default=0, type=int) + parser.add_argument('--lr_decay_ratio', default=1.0, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup_steps', default=1000, type=int) + parser.add_argument('--warmup_ratio', default=0.01, type=float) + parser.add_argument('--label_smooth', default=0, type=float) + parser.add_argument('--new_token_path', default="./", type=str) # save new token after add special token + parser.add_argument('--adam_beta1', default=0.9, type=float) + 
parser.add_argument('--adam_beta2', default=0.999, type=float) + parser.add_argument('--adam_epsilon', default=1e-8, type=float) + parser.add_argument('--scheduler_type', default='polynomial', type=str) + + return parent_args + + def __init__(self, tokenizer, args): + super().__init__() + self.save_hyperparameters(args) + self.model = BartForConditionalGeneration.from_pretrained(args.model_path) + self.tokenizer = tokenizer + + # add special token ans + # self.tokenizer.save_vocabulary(self.args.model_path) + new_vocab = args.model_path+"/sp_vocab/" + if not os.path.exists(new_vocab): + os.makedirs(new_vocab) + self.tokenizer.save_pretrained(new_vocab) + self.model.resize_token_embeddings(len(tokenizer)) + self.vocab_size = len(tokenizer) + self.rougescore = ROUGEScore(rouge_keys=('rougeL'), normalizer=lambda x: x) + + if self.hparams.label_smooth: + self.loss_fct = LabelSmoothingCrossEntropy(smoothing=0.1) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + loss = output.loss + if self.hparams.label_smooth: + loss = self.loss_fct(output.logits.view(-1, self.vocab_size), batch["labels"].view(-1)) + + self.log('train_loss', loss, sync_dist=True) + return loss + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = self.compute_acc(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + self.log('val_ppl', torch.exp(output.loss), sync_dist=True) + + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + do_sample=True, + num_beams=5, + early_stopping=True, + max_length=64, + top_p=0.9, + ) + + batch_label = torch.where(batch["labels"] != -100, batch["labels"], self.tokenizer.pad_token_id) + pred = self.tokenizer.batch_decode(cond_output, clean_up_tokenization_spaces=True, skip_special_tokens=True) + ques = self.tokenizer.batch_decode(batch_label, clean_up_tokenization_spaces=True, skip_special_tokens=True) + + pred = [chinese_char_tokenize(white_space_fix(p)) for p in pred] + ques = [chinese_char_tokenize(white_space_fix(q)) for q in ques] + self.rougescore.update(pred, ques) + + return pred + + def validation_epoch_end(self, validation_step_outputs): + rouge = self.rougescore.compute() + self.log('val_rouge', rouge["rougeL_fmeasure"], sync_dist=True) + + def on_predict_start(self): + self.loss_fct = torch.nn.CrossEntropyLoss(reduction='none') + + def predict_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + loss_tensor = 
self.loss_fct(output.logits.transpose(1, 2), batch["labels"]) + if self.hparams.tokenizer_type == 'bart': + eos_index = torch.where(batch['labels'] == self.tokenizer.eos_token_id)[1] + elif self.hparams.tokenizer_type == 'bert': + eos_index = torch.where(batch['labels'] == self.tokenizer.sep_token_id)[1] + + loss = torch.sum(loss_tensor, dim=1) / eos_index + + with torch.no_grad(): + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + do_sample=True, + num_beams=5, + max_length=64, + top_p=0.9, + output_scores=True, + return_dict_in_generate=True + ) + + pred = self.tokenizer.batch_decode( + cond_output.sequences, clean_up_tokenization_spaces=True, skip_special_tokens=True) # ['sequences'] + pred = [white_space_fix(p) for p in pred] # remove prompt and white space + score = cond_output.sequences_scores + return pred, score, loss + + def compute_acc(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(checkpoint['epoch'], checkpoint['global_step']))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_tokenizer(tokenizer_type, pretrained_model_path): + if tokenizer_type == 'bart': + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_path, use_fast=False, additional_special_tokens=["", "", ""]) + print(len(tokenizer)) + elif tokenizer_type == 'bert': + tokenizer = BertTokenizer.from_pretrained( + pretrained_model_path, use_fast=False, additional_special_tokens=["[ANS]"]) + return tokenizer + + +def main(): + total_parser = argparse.ArgumentParser("Finetune BART for QG") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--tokenizer_type', type=str, default="bart", choices=['bart', 'bert']) + total_parser.add_argument('--tensorboard_dir', type=str, default="bart") + total_parser.add_argument('--deepspeed') + + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + total_parser = QGT5Collator.add_data_specific_args(total_parser) + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = BARTFinetuneModel.add_model_specific_args(total_parser) + args = total_parser.parse_args() + + tokenizer = get_tokenizer(args.tokenizer_type, args.model_path) + collator = QGT5Collator(tokenizer=tokenizer, args=args) + data_model = UniversalDataModule(collate_fn=collator, tokenizer=tokenizer, args=args) + print("Data load complete...") + + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + model = BARTFinetuneModel(tokenizer, args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + if not args.do_eval_only: + 
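+        # Training path: when --do_eval_only is not set, run standard fitting;
+        # validation (loss, accuracy, perplexity, ROUGE-L) runs at the interval
+        # configured via --val_check_interval.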
        trainer.fit(model, data_model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/fengshen/examples/finetune_bart_qg/finetune_bart.sh b/fengshen/examples/finetune_bart_qg/finetune_bart.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ae88b230fa223c3d2c519e4f09cb1c703319af48
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/finetune_bart.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#SBATCH --job-name=bart_qg          # create a short name for your job
+#SBATCH --nodes=1                   # node count
+#SBATCH --ntasks-per-node=8         # number of tasks to run per node
+#SBATCH --cpus-per-task=10          # cpu-cores per task (>1 if multi-threaded tasks)
+#SBATCH --gres=gpu:1                # number of gpus per node
+#SBATCH -o %x-%j.log                # output and error log file names (%x for job name, %j for job id)
+set -x -e
+
+MODEL_NAME=IDEA-CCNL/Randeng-BART-139M
+RUN_NAME=bart_v0_test
+ROOT_DIR=../../workspace/log/$RUN_NAME
+
+config_json="$ROOT_DIR/$MODEL_NAME.ds_config.json"
+export MASTER_PORT=$[RANDOM%10000+40000]
+
+MICRO_BATCH_SIZE=32
+
+cat <<EOT > $config_json
+{
+    "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+    "gradient_clipping": 1,
+    "zero_optimization": {
+        "stage": 1
+    },
+    "fp16": {
+        "enabled": true
+    }
+}
+EOT
+export PL_DEEPSPEED_CONFIG_PATH=$config_json
+export TORCH_EXTENSIONS_DIR=../../workspace/torch_extensions
+
+DATA_ARGS=" \
+        --train_file train.json \
+        --val_file dev.json \
+        --test_file test.json \
+        --tokenizer_type bart \
+        --num_workers 8 \
+        --dataloader_workers 2 \
+        --train_batchsize $MICRO_BATCH_SIZE \
+        --val_batchsize $MICRO_BATCH_SIZE \
+        --test_batchsize $MICRO_BATCH_SIZE \
+        --max_seq_length 512 \
+        --max_src_length 32 \
+        --max_kno_length 416 \
+        --max_tgt_length 64 \
+        --mask_ans_style anstoken_multispan \
+        "
+
+MODEL_ARGS="\
+        --model_path $MODEL_NAME/ \
+        --learning_rate 1e-4 \
+        --min_learning_rate 1e-8 \
+        --lr_decay_steps 100000 \
+        --weight_decay 1e-2 \
+        --warmup_steps 1000 \
+        "
+
+MODEL_CHECKPOINT_ARGS="\
+        --monitor val_loss \
+        --save_top_k 3 \
+        --mode min \
+        --save_last \
+        --every_n_train_steps 5000 \
+        --save_ckpt_path $ROOT_DIR/ckpt/ \
+        --load_ckpt_path $ROOT_DIR/ckpt/ \
+        --filename model-{step:02d}-{train_loss:.4f} \
+        "
+
+TRAINER_ARGS="\
+        --gradient_clip_val 1.0 \
+        --max_epochs 1 \
+        --gpus 1 \
+        --num_nodes 1 \
+        --strategy ddp \
+        --log_every_n_steps 100 \
+        --val_check_interval 0.5 \
+        --accumulate_grad_batches 1 \
+        --default_root_dir $ROOT_DIR \
+        --tensorboard_dir $ROOT_DIR \
+        --label_smooth 0.1 \
+        "
+
+
+
+export options=" \
+        $DATA_ARGS \
+        $MODEL_ARGS \
+        $MODEL_CHECKPOINT_ARGS \
+        $TRAINER_ARGS \
+        "
+# test
+export SCRIPT_PATH=./finetune_bart.py
+
+python3 ${SCRIPT_PATH} $options > $ROOT_DIR/test.log
diff --git a/fengshen/examples/finetune_bart_qg/utils.py b/fengshen/examples/finetune_bart_qg/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..25cc1ef54673d3e7a465901eb905c4889f1397fd
--- /dev/null
+++ b/fengshen/examples/finetune_bart_qg/utils.py
@@ -0,0 +1,70 @@
+# -*- encoding: utf-8 -*-
+'''
+Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@File : utils.py +@Time : 2022/10/28 18:27 +@Author : Qi Yang +@Version : 1.0 +@Contact : yangqi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn.functional as F + + +class LabelSmoothingCrossEntropy(torch.nn.Module): + def __init__(self, smoothing=0.1): + super(LabelSmoothingCrossEntropy, self).__init__() + self.smoothing = smoothing + self.ignore_index = -100 + + def forward(self, x, target): + confidence = 1. - self.smoothing + logprobs = F.log_softmax(x, dim=-1) + targets_ignore = torch.where(target != self.ignore_index, target, 0) + nll_loss = -logprobs.gather(dim=-1, index=targets_ignore.unsqueeze(1)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +def truncate_sequence(document: str, max_num_tokens: int, reverse=False): + total_length = len(document) + if total_length <= max_num_tokens: + return document + else: + if reverse: + return document[-1*max_num_tokens:] + else: + return document[:max_num_tokens] + + +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - len(ids) + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff + + +def white_space_fix(text): + return "".join(text.split(" ")) + + +def remove_prompt(text): + if ":" in text: + return text.split(":")[1] + return text diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2b6037b10173b0f6f03563b7df8e0378821fb18f Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt new file mode 100644 index 0000000000000000000000000000000000000000..97b1b5ec990c1574dcdf9743392ef543e044e3ee --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000001.txt @@ -0,0 +1 @@ +牛津高阶英汉双解词典 (第6版)(内容一致,印次、封面或原价不同,统一售价,随机发货 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2c70487b1ae5803d8249a56130b0cfd6fbf0d722 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt new file mode 100644 index 0000000000000000000000000000000000000000..14810ab1fa32db2e3d225a3aa2b5a6280ae596ea --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000002.txt @@ -0,0 +1 @@ +照相机显示走和做购物的愉快的人民 股票视频 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..a9c82a86204a28a28fcb5d2e6276cb677cc2a26f --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18890cf53a412387ef810badb41917351f010920f49f07ccadcce6f0e990d29 +size 2083013 diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d1696755083f4b975a08e9521ab7677438b83e1 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000003.txt @@ -0,0 +1 @@ +直升机战争VR图4 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg new file mode 100644 index 0000000000000000000000000000000000000000..575d4da8cca0bb2e250352d0e0914bcd65d1886c Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt new file mode 100644 index 0000000000000000000000000000000000000000..43b4695f6b60f8183b94e50129e92915e9571cf3 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000004.txt @@ -0,0 +1 @@ +彩绘百合花图片 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c99776bb2c9475923a4fbde0716397789741cce1 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt new file mode 100644 index 0000000000000000000000000000000000000000..890c536b1098772268d01a7760e79c89c646f55b --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000005.txt @@ -0,0 +1 @@ +用与巧克力蛋糕的正确的新月形面包和在灰色木背景的一个桔子,其次洒与桃红色 图库摄影 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1bd7b79ebaaa257fbfac8af4f015dc83c5a038e2 Binary files /dev/null and b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.jpg differ diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt new file mode 100644 index 0000000000000000000000000000000000000000..d98543289903785609bc0d1d878321a69b80b231 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset/part_0/00000006.txt @@ -0,0 +1 @@ +可燃气体油管 库存图片 \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh new file mode 100644 index 
0000000000000000000000000000000000000000..8b7d5412f7bd75cb0700cca0699e029a022db7a7 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#SBATCH --job-name=evaluate_model # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # number of tasks to run per node +#SBATCH --cpus-per-task=5 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH -o inference_log/%x-%j.log # output and error log file names (%x for job id) +#SBATCH -p batch +#SBATCH --qos=ai4cogsys + +export SCRIPT_PATH=./evaluate_model.py + +MODEL_PATH='' + +srun python $SCRIPT_PATH $MODEL_PATH \ No newline at end of file diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b92b6a1e29bb31af553fd2e924a0a2b0dcdb4873 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/evaluate_model.py @@ -0,0 +1,294 @@ +import pytorch_lightning as pl +import torch.nn as nn +import torch.nn.functional as F +import torch +import timm +from torchvision import transforms as T +import open_clip +import sys +import torch +import json +from transformers import BertModel, BertTokenizer +from PIL import Image +from diffusers import StableDiffusionPipeline +import random +import os +from tqdm import tqdm + +os.environ['CUDA_LAUNCH_BLOCKING']='1' +torch.backends.cudnn.benchmark = True + +class AestheticsMLP(pl.LightningModule): + # 美学判别器是基于CLIP的基础上接了一个MLP + def __init__(self, input_size, xcol='emb', ycol='avg_rating'): + super().__init__() + self.input_size = input_size + self.xcol = xcol + self.ycol = ycol + self.layers = nn.Sequential( + nn.Linear(self.input_size, 1024), + #nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(1024, 128), + #nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(128, 64), + #nn.ReLU(), + nn.Dropout(0.1), + + nn.Linear(64, 16), + #nn.ReLU(), + + nn.Linear(16, 1) + ) + + def forward(self, x): + return self.layers(x) + + def training_step(self, batch, batch_idx): + x = batch[self.xcol] + y = batch[self.ycol].reshape(-1, 1) + x_hat = self.layers(x) + loss = F.mse_loss(x_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x = batch[self.xcol] + y = batch[self.ycol].reshape(-1, 1) + x_hat = self.layers(x) + loss = F.mse_loss(x_hat, y) + return loss + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) + return optimizer + + +class WaterMarkModel(nn.Module): + def __init__(self, model_path='./watermark_model_v1.pt'): + super(WaterMarkModel, self).__init__() + # model definition + self.model = timm.create_model( + 'efficientnet_b3a', pretrained=True, num_classes=2) + + self.model.classifier = nn.Sequential( + # 1536 is the orginal in_features + nn.Linear(in_features=1536, out_features=625), + nn.ReLU(), # ReLu to be the activation function + nn.Dropout(p=0.3), + nn.Linear(in_features=625, out_features=256), + nn.ReLU(), + nn.Linear(in_features=256, out_features=2), + ) + self.model.load_state_dict(torch.load(model_path)) + def forward(self, x): + return self.model(x) + +class FilterSystem: + def __init__( + self, + clip_model_path="IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese", + aesthetics_model_path="./ava+logos-l14-linearMSE.pth", + watermark_model_path="./watermark_model_v1.pt" + ): + self.clip_model_path = clip_model_path + self.aesthetics_model_path = 
aesthetics_model_path + self.watermark_model_path = watermark_model_path + self.init_aesthetics_model() + self.init_clip_model() + self.init_watermark_model() + + def init_clip_model(self, ): + # 此处初始化clip模型,返回模型、tokenizer、processor + text_encoder = BertModel.from_pretrained(self.clip_model_path).eval().cuda() + text_tokenizer = BertTokenizer.from_pretrained(self.clip_model_path) + clip_model, _, processor = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai') + clip_model = clip_model.eval().cuda() + self.text_encoder, self.text_tokenizer, self.clip_model, self.processor = text_encoder, text_tokenizer, clip_model, processor + print("clip model loaded") + return None + + def init_aesthetics_model(self, ): + # 此处初始化美学模型 + self.aesthetics_model = AestheticsMLP(768) + self.aesthetics_model.load_state_dict(torch.load(self.aesthetics_model_path)) + self.aesthetics_model.eval().cuda() + print("aesthetics model loaded") + return None + + def init_watermark_model(self, ): + self.watermark_model = WaterMarkModel(self.watermark_model_path) + self.watermark_model.eval().cuda() + self.watermark_processor = T.Compose([ + T.Resize((256, 256)), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + print("watermark model loaded") + return None + + def get_image_feature(self, images): + # 此处返回图像的特征向量 + if isinstance(images, list): + images = torch.stack([self.processor(image) for image in images]).cuda() + elif isinstance(images, torch.Tensor): + images = images.cuda() + else: + images = self.processor(images).cuda() + + with torch.no_grad(): + image_features = self.clip_model.encode_image(images) + image_features /= image_features.norm(dim=1, keepdim=True) + return image_features + + def get_text_feature(self, text): + # 此处返回文本的特征向量 + if isinstance(text, list) or isinstance(text, str): + text = self.text_tokenizer(text, return_tensors='pt', padding=True)['input_ids'].cuda() + elif isinstance(text, torch.Tensor): + text = text.cuda() + + with torch.no_grad(): + text_features = self.text_encoder(text)[1] + text_features /= text_features.norm(dim=1, keepdim=True) + return text_features + + def calculate_clip_score(self, features1, features2): + # 此处2个特征向量的相似度,输入可以是 图片+文本、文本+文本、图片+图片。 + # 返回的是相似度矩阵,维度为 f1.shape[0] * f2.shape[0] + score_matrix = features1 @ features2.t() + return score_matrix + + def get_clip_score(self, text, image): + text_feature = self.get_text_feature(text) + image_feature = self.get_image_feature(image) + return self.calculate_clip_score(text_feature, image_feature) + + def get_aesthetics_score(self, features): + # 此处返回美学分数,传入的是CLIP的feature, 先计算get_image_feature在传入此函数~(模型是ViT-L-14) + with torch.no_grad(): + scores = self.aesthetics_model(features) + scores = scores[:, 0].detach().cpu().numpy() + return scores + + def get_watermark_score(self, images): + if isinstance(images, list): + images = torch.stack([self.watermark_processor(image) for image in images]).cuda() + elif isinstance(images, torch.Tensor): + images = images.cuda() + with torch.no_grad(): + pred = self.watermark_model(images) + watermark_scores = F.softmax(pred, dim=1)[:,0].detach().cpu().numpy() + + return watermark_scores + +class InferenceFlickr: + def __init__(self, sd_model_list, sample_num=20, guidance_scale=7.5, test_caption_path="/cognitive_comp/chenweifeng/project/dataset/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt"): + self.model_name_list = sd_model_list + self.guidance_scale = guidance_scale + self.sample_num=sample_num + self.score_model = FilterSystem() + 
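+        # FilterSystem (defined above) bundles the Taiyi Chinese CLIP text encoder,
+        # an open_clip ViT-L/14 image encoder, the aesthetics MLP and the watermark
+        # classifier; every generated image is scored with all three metrics below.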
self.caption_path = test_caption_path + self.score = dict() + self.final_score = dict() + + def init_model(self): + self.model_list = [] + for model_name in self.model_name_list: + pipe = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=torch.float16).to("cuda") + self.model_list.append(pipe) + + def generate_image_score(self, prompt_list, model_list): + generator = torch.Generator(device=0) + generator = generator.manual_seed(42) + # num_images = 1 + # latents = None + # seeds = [] + # for _ in range(num_images): + # generator = generator.manual_seed(42) + + # image_latents = torch.randn( + # (1, pipe.unet.in_channels, 512 // 8, 512 // 8), + # generator = generator, + # device =1 + # ) + # latents = image_latents if latents is None else torch.cat((latents, image_latents)) + for i, model in enumerate(model_list): + model_name = self.model_name_list[i] + self.score[model_name] = dict() + for j, prompt in tqdm(enumerate(prompt_list)): + latents = None + image_latents = torch.randn( + (1, model.unet.in_channels, 512 // 8, 512 // 8), + generator = generator, + device =0, + dtype=torch.float16 + ) + latents = image_latents if latents is None else torch.cat((latents, image_latents)) + image = model(prompt, guidance_scale=self.guidance_scale, latents=latents, torch_dtype=torch.float16).images[0] + image_feature = self.score_model.get_image_feature([image]) + text_feature = self.score_model.get_text_feature(prompt) + image_clip_score = self.score_model.calculate_clip_score(image_feature, text_feature) + image_watermark_score = self.score_model.get_watermark_score([image]) + image_aesthetics_score =self.score_model.get_aesthetics_score(image_feature) + self.score[model_name][prompt] = { + "clip_score": float(image_clip_score[0][0]), + "watermark_score": float(image_watermark_score[0]), + "aesthetics_score": float(image_aesthetics_score[0]), + } + image.save(f"tmp/{prompt}_model-{str(i)}.png") + + def get_prompt_list(self, seed=42, ): + with open(self.caption_path) as fin: + input_lines = fin.readlines() + tmp_list = [] + for line in input_lines: + infos = line.strip('\n').split('\t') + prompt = infos[1] + tmp_list.append(prompt) + random.seed(seed) + prompt_list = random.sample(tmp_list, self.sample_num) + return prompt_list + + def run(self): + self.init_model() + prompt_list = self.get_prompt_list() + self.generate_image_score(prompt_list, self.model_list) + + def show(self, save_path=None): + # print(self.score) + print(self.final_score) + if save_path: + with open(save_path, 'w') as fout: + json.dump(fout, self.final_score, indent=2, ensure_ascii=False) + + def calculate_score(self,): + for model_name in self.score.keys(): + clip_score = 0.0 + watermark_score = 0.0 + aesthetics_score = 0.0 + for prompt in self.score[model_name]: + clip_score += self.score[model_name][prompt]['clip_score'] + watermark_score += self.score[model_name][prompt]['watermark_score'] + aesthetics_score += self.score[model_name][prompt]['aesthetics_score'] + average_clip_score = clip_score / len(self.score[model_name].keys()) + average_watermark_score = watermark_score / len(self.score[model_name].keys()) + average_aesthetics_score = aesthetics_score / len(self.score[model_name].keys()) + self.final_score[model_name] = {"avg_clip": average_clip_score, "avg_watermark": average_watermark_score, 'avg_aesthetics': average_aesthetics_score} + +def main(): + model_path = sys.argv[1] + model_list = [ + # '/cognitive_comp/chenweifeng/project/stable-diffusion-lightning/finetune_taiyi_v0.40_laion', + # 
'/cognitive_comp/chenweifeng/project/stable-diffusion-chinese/finetune_taiyi0' + # "/cognitive_comp/lixiayu/diffuser_models/wukong_epoch1" + # "/cognitive_comp/lixiayu/work/Fengshenbang-LM/fengshen/workspace/taiyi-stablediffusion-laion/60per_ckpt", + model_path + ] + score_model = InferenceFlickr(model_list, sample_num=1000) + score_model.run() + score_model.calculate_score() + score_model.show() + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f27358402cd0de23353acf6eaedf247949ec0a --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.py @@ -0,0 +1,188 @@ +import os +import torch +import argparse +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from transformers import BertTokenizer, BertModel +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from torch.nn import functional as F +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from torchvision import transforms +from PIL import Image +from torch.utils.data._utils.collate import default_collate + + +class Collator(): + def __init__(self, args, tokenizer): + self.image_transforms = transforms.Compose( + [ + transforms.Resize( + args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop( + args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + self.tokenizer = tokenizer + + def __call__(self, inputs): + examples = [] + max_length = min(max([len(i['caption']) for i in inputs]), 512) + for i in inputs: + example = {} + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["pixel_values"] = self.image_transforms(instance_image) + example["input_ids"] = self.tokenizer( + i['caption'], + padding="max_length", + truncation=True, + max_length=max_length, + return_tensors='pt', + )['input_ids'][0] + examples.append(example) + return default_collate(examples) + + +class StableDiffusion(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module') + parser.add_argument('--freeze_unet', action='store_true', default=False) + parser.add_argument('--freeze_text_encoder', action='store_true', default=False) + return parent_parser + + def __init__(self, args): + super().__init__() + self.tokenizer = BertTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = BertModel.from_pretrained( + args.model_path, subfolder="text_encoder") # load from taiyi_finetune-v0 + self.vae = AutoencoderKL.from_pretrained( + args.model_path, subfolder="vae") + self.unet = UNet2DConditionModel.from_pretrained( + args.model_path, subfolder="unet") + # TODO: 使用xformers配合deepspeed速度反而有下降(待确认 + self.unet.set_use_memory_efficient_attention_xformers(False) + + self.noise_scheduler = 
DDPMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 + ) + + for param in self.vae.parameters(): + param.requires_grad = False + + if args.freeze_text_encoder: + for param in self.text_encoder.parameters(): + param.requires_grad = False + + if args.freeze_unet: + for param in self.unet.parameters(): + param.requires_grad = False + + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + self.text_encoder.train() + + latents = self.vae.encode(batch["pixel_values"]).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + noise = noise.to(dtype=self.unet.dtype) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noisy_latents.to(dtype=self.unet.dtype) + + # Get the text embedding for conditioning + encoder_hidden_states = self.text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample + + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + self.log("train_loss", loss.item(), on_epoch=False, prog_bar=True, logger=True) + + if self.trainer.global_rank == 0 and self.global_step == 100: + # 打印显存占用 + from fengshen.utils.utils import report_memory + report_memory('stable diffusion') + + return {"loss": loss} + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer.global_rank == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + self.hparams.model_path, + text_encoder=self.text_encoder, + tokenizer=self.tokenizer, + unet=self.unet) + self.trainer.current_epoch + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}')) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = StableDiffusion.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = StableDiffusion(args) + tokenizer = model.tokenizer + 
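+    # load_data builds the image/caption sample list from --datasets_path
+    # (txt format: paired .jpg / .txt files, as in demo_dataset/); the Collator
+    # above resizes/crops the images and tokenizes the captions into the
+    # pixel_values / input_ids batches consumed by training_step.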
datasets = load_data(args, global_rank=trainer.global_rank) + collate_fn = Collator(args, tokenizer) + + datamoule = UniversalDataModule( + tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets) + + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..5e6dab37e15c396776da02e9c549c048dff6f259 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune.sh @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=finetune_taiyi # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-stablediffusion-1B +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=1 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "bf16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_path ./demo_dataset \ + --datasets_type txt \ + --resolution 512 \ + " + +MODEL_ARGS="\ + --model_path IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --precision bf16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 finetune.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_deberta.py $options diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..94d4593aa3a03f19f007cd18cf747f580d20a6e4 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb @@ -0,0 +1,601 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": 
"hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 🖌️ **Finetuning Taiyi-Stable-Diffusion Colab Example**\n", + "\n", + "#####based on https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1\n" + ], + "metadata": { + "id": "-GisYq7cG41a" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Installing fengshen framework" + ], + "metadata": { + "id": "twrdGg5zaY0m" + } + }, + { + "cell_type": "code", + "source": [ + "from IPython.display import clear_output\n", + "\n", + "!pip install pytorch_lightning\n", + "!pip install transformers\n", + "!pip install deepspeed\n", + "!pip install diffusers\n", + "!pip install datasets\n", + "!pip install accelerate\n", + "\n", + "!git clone https://github.com/IDEA-CCNL/Fengshenbang-LM\n", + "\n", + "clear_output()\n", + "print(\"Done!\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y24PHP7dG4gj", + "outputId": "8c444a57-dfc8-4e6e-84f6-f7cbdde03c68" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Done!\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "# 切换工作路径\n", + "os.chdir('/content/Fengshenbang-LM')\n", + "print(os.getcwd())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lwZ2CAgkLgda", + "outputId": "d2471d59-c1a5-43d1-fb19-055bf8dfde2c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/content/Fengshenbang-LM\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Building modules" + ], + "metadata": { + "id": "EMYaGij5acpb" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CnXybs4VFJnz" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "import argparse\n", + "from pytorch_lightning import (\n", + " LightningModule,\n", + " Trainer,\n", + ")\n", + "from pytorch_lightning.callbacks import (\n", + " LearningRateMonitor,\n", + ")\n", + "from fengshen.data.universal_datamodule import UniversalDataModule\n", + "from fengshen.models.model_utils import (\n", + " add_module_args,\n", + " configure_optimizers,\n", + " get_total_steps,\n", + ")\n", + "from fengshen.utils.universal_checkpoint import UniversalCheckpoint\n", + "from transformers import BertTokenizer, BertModel\n", + "from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel\n", + "from torch.nn import functional as F\n", + "from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data\n", + "from torchvision import transforms\n", + "from PIL import Image\n", + "from torch.utils.data._utils.collate import default_collate\n", + "\n", + "\n", + "class Collator():\n", + " def __init__(self, args, tokenizer):\n", + " self.image_transforms = transforms.Compose(\n", + " [\n", + " transforms.Resize(\n", + " args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),\n", + " transforms.CenterCrop(\n", + " args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.5], [0.5]),\n", + " ]\n", + " )\n", + " self.tokenizer = tokenizer\n", + "\n", + " def __call__(self, inputs):\n", + " examples = 
[]\n", + " max_length = min(max([len(i['caption']) for i in inputs]), 512)\n", + " for i in inputs:\n", + " example = {}\n", + " instance_image = Image.open(i['img_path'])\n", + " if not instance_image.mode == \"RGB\":\n", + " instance_image = instance_image.convert(\"RGB\")\n", + " example[\"pixel_values\"] = self.image_transforms(instance_image)\n", + " example[\"input_ids\"] = self.tokenizer(\n", + " i['caption'],\n", + " padding=\"max_length\",\n", + " truncation=True,\n", + " max_length=max_length,\n", + " return_tensors='pt',\n", + " )['input_ids'][0]\n", + " examples.append(example)\n", + " return default_collate(examples)\n", + "\n", + "class StableDiffusion(LightningModule):\n", + " @staticmethod\n", + " def add_module_specific_args(parent_parser):\n", + " parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module')\n", + " parser.add_argument('--freeze_unet', action='store_true', default=False)\n", + " parser.add_argument('--freeze_text_encoder', action='store_true', default=False)\n", + " return parent_parser\n", + "\n", + " def __init__(self, args):\n", + " super().__init__()\n", + " self.tokenizer = BertTokenizer.from_pretrained(\n", + " args.model_path, subfolder=\"tokenizer\")\n", + " self.text_encoder = BertModel.from_pretrained(\n", + " args.model_path, subfolder=\"text_encoder\") # load from taiyi_finetune-v0\n", + " self.vae = AutoencoderKL.from_pretrained(\n", + " args.model_path, subfolder=\"vae\")\n", + " self.unet = UNet2DConditionModel.from_pretrained(\n", + " args.model_path, subfolder=\"unet\")\n", + " # TODO: 使用xformers配合deepspeed速度反而有下降(待确认\n", + " self.unet.set_use_memory_efficient_attention_xformers(False)\n", + "\n", + " self.noise_scheduler = DDPMScheduler(\n", + " beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", num_train_timesteps=1000\n", + " )\n", + "\n", + " for param in self.vae.parameters():\n", + " param.requires_grad = False\n", + "\n", + " if args.freeze_text_encoder:\n", + " for param in self.text_encoder.parameters():\n", + " param.requires_grad = False\n", + "\n", + " if args.freeze_unet:\n", + " for param in self.unet.parameters():\n", + " param.requires_grad = False\n", + "\n", + " self.save_hyperparameters(args)\n", + "\n", + " def setup(self, stage) -> None:\n", + " if stage == 'fit':\n", + " self.total_steps = get_total_steps(self.trainer, self.hparams)\n", + " print('Total steps: {}' .format(self.total_steps))\n", + "\n", + " def configure_optimizers(self):\n", + " return configure_optimizers(self)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " self.text_encoder.train()\n", + "\n", + " latents = self.vae.encode(batch[\"pixel_values\"]).latent_dist.sample()\n", + " latents = latents * 0.18215\n", + "\n", + " # Sample noise that we'll add to the latents\n", + " noise = torch.randn(latents.shape).to(latents.device)\n", + " noise = noise.to(dtype=self.unet.dtype)\n", + " bsz = latents.shape[0]\n", + " # Sample a random timestep for each image\n", + " timesteps = torch.randint(\n", + " 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)\n", + " timesteps = timesteps.long()\n", + " # Add noise to the latents according to the noise magnitude at each timestep\n", + " # (this is the forward diffusion process)\n", + "\n", + " noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)\n", + " noisy_latents = noisy_latents.to(dtype=self.unet.dtype)\n", + "\n", + " # Get the text embedding for conditioning\n", + " encoder_hidden_states = 
self.text_encoder(batch[\"input_ids\"])[0]\n", + "\n", + " # Predict the noise residual\n", + " noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample\n", + "\n", + " loss = F.mse_loss(noise_pred, noise, reduction=\"none\").mean([1, 2, 3]).mean()\n", + " self.log(\"train_loss\", loss.item(), on_epoch=False, prog_bar=True, logger=True)\n", + "\n", + " if self.trainer.global_rank == 0 and self.global_step == 100:\n", + " # 打印显存占用\n", + " from fengshen.utils.utils import report_memory\n", + " report_memory('stable diffusion')\n", + "\n", + " return {\"loss\": loss}\n", + "\n", + " def on_save_checkpoint(self, checkpoint) -> None:\n", + " if self.trainer.global_rank == 0:\n", + " print('saving model...')\n", + " pipeline = StableDiffusionPipeline.from_pretrained(\n", + " self.hparams.model_path,\n", + " text_encoder=self.text_encoder,\n", + " tokenizer=self.tokenizer,\n", + " unet=self.unet)\n", + " self.trainer.current_epoch\n", + " pipeline.save_pretrained(os.path.join(\n", + " args.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}'))\n", + "\n", + " def on_load_checkpoint(self, checkpoint) -> None:\n", + " # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0\n", + " global_step_offset = checkpoint[\"global_step\"]\n", + " if 'global_samples' in checkpoint:\n", + " self.consumed_samples = checkpoint['global_samples']\n", + " self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Settings" + ], + "metadata": { + "id": "jN-ATKxi1TUa" + } + }, + { + "cell_type": "code", + "source": [ + "from pprint import pprint\n", + "\n", + "args_parser = argparse.ArgumentParser()\n", + "args_parser = add_module_args(args_parser)\n", + "args_parser = add_data_args(args_parser)\n", + "args_parser = UniversalDataModule.add_data_specific_args(args_parser)\n", + "args_parser = Trainer.add_argparse_args(args_parser)\n", + "args_parser = StableDiffusion.add_module_specific_args(args_parser)\n", + "args_parser = UniversalCheckpoint.add_argparse_args(args_parser)\n", + "\n", + "# 你的数据集,可以参考 https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen/examples/finetune_taiyi_stable_diffusion 的demo_dataset的设置\n", + "your_dataset_path = '/content/Fengshenbang-LM/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset' #@param {type:\"string\"}\n", + "# 默认为下载huggingface上的模型\n", + "your_model_path = 'IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1' #@param {type:\"string\"}\n", + "train_batch_size = '1' #@param {type:\"string\"}\n", + "\n", + "message = [\n", + " '--datasets_path', your_dataset_path,\n", + " '--datasets_type', 'txt',\n", + " '--model_path', your_model_path,\n", + " '--train_batchsize', train_batch_size,\n", + " '--accelerator', 'gpu',\n", + " # '--strategy', 'deepspeed',\n", + " '--precision', '16',\n", + "]\n", + "\n", + "args = args_parser.parse_args(args=message)\n", + "\n", + "pprint(vars(args), width = 230)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KFnQFiQ_1S8w", + "outputId": "0802adc1-2b62-4557-96aa-796b5a2ca535" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'accelerator': 'gpu',\n", + " 'accumulate_grad_batches': None,\n", + " 'adam_beta1': 0.9,\n", + " 'adam_beta2': 0.999,\n", + " 'adam_epsilon': 1e-08,\n", + " 'amp_backend': None,\n", + " 'amp_level': None,\n", + " 'auto_lr_find': False,\n", + " 'auto_scale_batch_size': False,\n", + " 
'auto_select_gpus': None,\n", + " 'benchmark': None,\n", + " 'center_crop': False,\n", + " 'check_val_every_n_epoch': 1,\n", + " 'dataloader_workers': 2,\n", + " 'datasets_name': None,\n", + " 'datasets_path': ['/content/Fengshenbang-LM/fengshen/examples/finetune_taiyi_stable_diffusion/demo_dataset'],\n", + " 'datasets_type': ['txt'],\n", + " 'default_root_dir': None,\n", + " 'detect_anomaly': False,\n", + " 'devices': None,\n", + " 'enable_checkpointing': True,\n", + " 'enable_model_summary': True,\n", + " 'enable_progress_bar': True,\n", + " 'every_n_epochs': None,\n", + " 'every_n_train_steps': None,\n", + " 'fast_dev_run': False,\n", + " 'filename': 'model-ep{epoch:02d}-st{step:d}',\n", + " 'freeze_text_encoder': False,\n", + " 'freeze_unet': False,\n", + " 'gpus': None,\n", + " 'gradient_clip_algorithm': None,\n", + " 'gradient_clip_val': None,\n", + " 'inference_mode': True,\n", + " 'ipus': None,\n", + " 'learning_rate': 5e-05,\n", + " 'limit_predict_batches': None,\n", + " 'limit_test_batches': None,\n", + " 'limit_train_batches': None,\n", + " 'limit_val_batches': None,\n", + " 'load_ckpt_path': './ckpt/',\n", + " 'log_every_n_steps': 50,\n", + " 'logger': True,\n", + " 'lr_decay_ratio': 1.0,\n", + " 'lr_decay_steps': 0,\n", + " 'max_epochs': None,\n", + " 'max_steps': -1,\n", + " 'max_time': None,\n", + " 'min_epochs': None,\n", + " 'min_learning_rate': 1e-07,\n", + " 'min_steps': None,\n", + " 'mode': 'max',\n", + " 'model_path': 'IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1',\n", + " 'monitor': 'step',\n", + " 'move_metrics_to_cpu': False,\n", + " 'multiple_trainloader_mode': 'max_size_cycle',\n", + " 'num_nodes': 1,\n", + " 'num_processes': None,\n", + " 'num_sanity_val_steps': 2,\n", + " 'num_workers': 8,\n", + " 'overfit_batches': 0.0,\n", + " 'plugins': None,\n", + " 'precision': 16,\n", + " 'profiler': None,\n", + " 'raw_file_type': 'json',\n", + " 'reload_dataloaders_every_n_epochs': 0,\n", + " 'replace_sampler_ddp': True,\n", + " 'resolution': 512,\n", + " 'resume_from_checkpoint': None,\n", + " 'sampler_type': 'random',\n", + " 'save_ckpt_path': './ckpt/',\n", + " 'save_last': False,\n", + " 'save_on_train_epoch_end': None,\n", + " 'save_top_k': 10,\n", + " 'save_weights_only': False,\n", + " 'scheduler_type': 'polynomial',\n", + " 'strategy': None,\n", + " 'sync_batchnorm': False,\n", + " 'test_batchsize': 16,\n", + " 'test_datasets_field': 'test',\n", + " 'test_file': None,\n", + " 'thres': 0.2,\n", + " 'tpu_cores': None,\n", + " 'track_grad_norm': -1,\n", + " 'train_batchsize': 1,\n", + " 'train_datasets_field': 'train',\n", + " 'train_file': None,\n", + " 'val_batchsize': 16,\n", + " 'val_check_interval': None,\n", + " 'val_datasets_field': 'validation',\n", + " 'val_file': None,\n", + " 'warmup_ratio': 0.1,\n", + " 'warmup_steps': 0,\n", + " 'weight_decay': 0.1}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Start training" + ], + "metadata": { + "id": "sgSAEhHoagek" + } + }, + { + "cell_type": "code", + "source": [ + "!nvidia-smi\n", + "!cat /proc/meminfo" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yALlfBnj4AUF", + "outputId": "528bd7a5-1c9a-48e5-d92a-471ac9774dde" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mon Feb 13 05:26:48 2023 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |\n", + 
"|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 71C P0 32W / 70W | 3MiB / 15360MiB | 0% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n", + "MemTotal: 26690612 kB\n", + "MemFree: 22642924 kB\n", + "MemAvailable: 24857828 kB\n", + "Buffers: 48944 kB\n", + "Cached: 2435656 kB\n", + "SwapCached: 0 kB\n", + "Active: 331644 kB\n", + "Inactive: 3390024 kB\n", + "Active(anon): 1568 kB\n", + "Inactive(anon): 1233664 kB\n", + "Active(file): 330076 kB\n", + "Inactive(file): 2156360 kB\n", + "Unevictable: 0 kB\n", + "Mlocked: 0 kB\n", + "SwapTotal: 0 kB\n", + "SwapFree: 0 kB\n", + "Dirty: 396 kB\n", + "Writeback: 0 kB\n", + "AnonPages: 1237216 kB\n", + "Mapped: 537236 kB\n", + "Shmem: 1304 kB\n", + "KReclaimable: 103288 kB\n", + "Slab: 147424 kB\n", + "SReclaimable: 103288 kB\n", + "SUnreclaim: 44136 kB\n", + "KernelStack: 5216 kB\n", + "PageTables: 21332 kB\n", + "NFS_Unstable: 0 kB\n", + "Bounce: 0 kB\n", + "WritebackTmp: 0 kB\n", + "CommitLimit: 13345304 kB\n", + "Committed_AS: 3402512 kB\n", + "VmallocTotal: 34359738367 kB\n", + "VmallocUsed: 57876 kB\n", + "VmallocChunk: 0 kB\n", + "Percpu: 2672 kB\n", + "HardwareCorrupted: 0 kB\n", + "AnonHugePages: 0 kB\n", + "ShmemHugePages: 0 kB\n", + "ShmemPmdMapped: 0 kB\n", + "FileHugePages: 0 kB\n", + "FilePmdMapped: 0 kB\n", + "CmaTotal: 0 kB\n", + "CmaFree: 0 kB\n", + "HugePages_Total: 0\n", + "HugePages_Free: 0\n", + "HugePages_Rsvd: 0\n", + "HugePages_Surp: 0\n", + "Hugepagesize: 2048 kB\n", + "Hugetlb: 0 kB\n", + "DirectMap4k: 484160 kB\n", + "DirectMap2M: 15241216 kB\n", + "DirectMap1G: 13631488 kB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pytorch_lightning as pl\n", + "print(pl.__version__)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s1vid40pLVDF", + "outputId": "25a92804-0fbd-4004-83fd-80e763286555" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.9.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "lr_monitor = LearningRateMonitor(logging_interval='step')\n", + "checkpoint_callback = UniversalCheckpoint(args)\n", + "\n", + "trainer = Trainer.from_argparse_args(args,\n", + " callbacks=[\n", + " lr_monitor,\n", + " checkpoint_callback])\n", + "\n", + "model = StableDiffusion(args)\n", + "tokenizer = model.tokenizer\n", + "\n", + "datasets = load_data(args, global_rank=trainer.global_rank)\n", + "collate_fn = Collator(args, tokenizer)\n", + "\n", + "datamoule = UniversalDataModule(\n", + " tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets)\n", + "\n", + "trainer.fit(model, datamoule)" + ], + "metadata": { + "id": "b4nSmmNrLVwG" + }, + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "It might be OOM, which is caused by low GPU memory in Colab.\n", + "\n", + "This notebook proves that our codes can run in our settings." + ], + "metadata": { + "id": "9DnOM7qNbokd" + } + } + ] +} diff --git a/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md b/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..62c5b8b488ed6a45c0eab17cf59ceda0fc335194 --- /dev/null +++ b/fengshen/examples/finetune_taiyi_stable_diffusion/readme.md @@ -0,0 +1,46 @@ +# Taiyi-Stable-Diffusion Finetune示例 + +本示例可以应用于**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**在自建的数据集上进行进一步训练,同时稍微修改代码也能够兼容大部分Stable-Diffusion结构。本示例仅提供参考,有任何疑问或者有需要协助的都可以提Issue到本项目中,会有专门的同学解答~ + +注:已更新了[colab的example](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/fengshen/examples/finetune_taiyi_stable_diffusion/finetune_taiyi_stable_diffusion_example.ipynb) + +## 数据处理 + +在./demo_dataset下有我们一个数据集的样例,其中一个sample由.jpg格式图片以及.txt文本文件组成,用户可以按照我们的格式处理然后直接将脚本内的datasets_path修改为自己的路径即可。(数据摘自[IDEA-CCNL/laion2B-multi-chinese-subset](https://huggingface.co./datasets/IDEA-CCNL/laion2B-multi-chinese-subset)) + +## 配置要求 + +Finetune **IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1** 十亿级别参数,我们自己测试所需要的配置基础如下。batch_size设定为1 + +fp32: + +- 显存:26G以上 +- 内存:64G以上 + +fp16: + +- 显存:21G以上 +- 内存:64G以上 + +fp16 + deepspeed offload + +- 显存:6G以上 +- 内存:80G以上 + +## 运行脚本 + +处理好自己的数据集后,只需要将脚本中的datasets_path指向你的数据集,不需要修改其他参数就能运行。在脚本中也提供了丰富的超参供大家修改,例如batch_size, ckpt_path等等都可以根据自己的需求做更改,其中model_path指向的是huggingface上的模型路径,下载可能比较慢,如果用户已经在本地下载过一份权重,直接将model_path改成本地路径即可。 + +一些常用的参数我们会放在[封神榜的文档里](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/%E5%B0%81%E7%A5%9E%E6%A1%86%E6%9E%B6/%E5%8F%82%E6%95%B0%E7%AE%A1%E7%90%86.html) + +有任何不清楚的地方,不要吝啬你的Issue,直接提过来。 + +## 一些训练中的Trick + +### Deepspeed + +在示例中我们默认开始了Deepspeed,通过Deepspeed我们能提高不少训练效率(即使是单卡)。并且得益于Zero Redundancy Optimizer的技术,在多卡的环境我们能显著的减少显存占用,提高batch_size以获得更高的效率,强烈建议有条件的同学开启Deepspeed。 + +### 8BitAdam + +TODO: 优化显存以及提高训练效率 diff --git a/fengshen/examples/hubert/pretrain_hubert.py b/fengshen/examples/hubert/pretrain_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..6506364b9498c5b994c085e1a5342082283ef62b --- /dev/null +++ b/fengshen/examples/hubert/pretrain_hubert.py @@ -0,0 +1,287 @@ +import fengshen.data.hubert.hubert_dataset as datasets +from fengshen.data.universal_datamodule import UniversalDataModule +from transformers import HubertConfig, HubertModel +# from transformers.models.hubert.modeling_hubert import _compute_mask_indices +import argparse +from fairseq.data import Dictionary +from pytorch_lightning import ( + LightningModule, + Trainer, + loggers, +) +from pytorch_lightning.callbacks import LearningRateMonitor +import torch +import os +import torch.nn.functional as F +import torch.nn as nn + + +class LabelEncoder(object): + def __init__(self, dictionary: Dictionary): + self.dictionary = dictionary + + def __call__(self, label: str): + return self.dictionary.encode_line( + label, + append_eos=False, + add_if_not_exist=False, + ) + + +class HubertPretrainDataLoader(): + def __init__(self, args): + self.cfg = args + self.dictionaries = self.load_dictionaries() + self.load_datasets = {} + + # TODO 改成HuggingFace Tokenizer + def load_dictionaries(self): + label_dir = self.cfg.data if self.cfg.label_dir is None else self.cfg.label_dir + dictionaries = [ + Dictionary.load(f"{label_dir}/dict.{label}.txt") + for label in 
self.cfg.labels + ] + return dictionaries + + def get_label_dir(self): + if self.cfg.label_dir is None: + return self.cfg.data + return self.cfg.label_dir + + @property + def datasets(self): + return self.load_datasets + + def load_dataset(self, split: str, **kwargs): + manifest = f"{self.cfg.data}/{split}.tsv" + dicts = self.dictionaries + pad_list = [dict.pad() for dict in dicts] + eos_list = [dict.eos() for dict in dicts] + procs = [LabelEncoder(dict) for dict in dicts] + paths = [f"{self.get_label_dir()}/{split}.{lb}" for lb in self.cfg.labels] + + # hubert v1: pad_audio=True, random_crop=False; + self.load_datasets[split] = datasets.HubertDataset( + manifest, + sample_rate=self.cfg.sample_rate, + label_paths=paths, + label_rates=self.cfg.label_rate, + pad_list=pad_list, + eos_list=eos_list, + label_processors=procs, + max_keep_sample_size=self.cfg.max_keep_size, + min_keep_sample_size=self.cfg.min_sample_size, + max_sample_size=self.cfg.max_sample_size, + pad_audio=self.cfg.pad_audio, + normalize=self.cfg.normalize, + store_labels=False, + random_crop=self.cfg.random_crop, + single_target=self.cfg.single_target, + ) + + +def perpare_data(args): + loader = HubertPretrainDataLoader(args) + loader.load_dataset('train') + loader.load_dataset('valid') + return loader + + +class HubertLightning(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('HuBert Lightning') + parser.add_argument('--pred_masked_weight', type=float, default=1.0) + parser.add_argument('--logit_temp', type=float, default=1.0) + parser.add_argument('--loss_weights', type=float, nargs='+') + # parser.add_argument('--mask_prob', type=float, default=0.65) + # parser.add_argument('--mask_length', type=int, default=10) + # parser.add_argument('--mask_selection', type=str, default='static', + # choice=["static", "uniform", "normal", "poisson"]) + # parser.add_argument('--mask_other', type=float, default=0) + # parser.add_argument('--no_mask_overlap', type=bool, default=False) + # parser.add_argument('--mask_min_space', type=int, default=1) + return parent_parser + + def __init__(self, args, loader, ** kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = HubertConfig.from_pretrained(args.model_path) + self.config = config + self.model = HubertModel(config=config) + self.num_classes = [len(d) for d in loader.dictionaries] + self.label_embs_concat = nn.Parameter( + torch.FloatTensor(sum(self.num_classes), self.config.conv_dim[-1] // 2) + ) + self.final_proj = nn.Linear( + self.config.hidden_size, self.config.conv_dim[-1] // 2 * len(loader.dictionaries) + ) + nn.init.uniform_(self.label_embs_concat) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def compute_nce(self, x, pos, negs): + neg_is_pos = (pos == negs).all(-1) + pos = pos.unsqueeze(0) + targets 
= torch.cat([pos, negs], dim=0) + + logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) + logits /= self.hparams.logit_temp + if neg_is_pos.any(): + logits[1:][neg_is_pos] = float("-inf") + logits = logits.transpose(0, 1) # (num_x, num_cls+1) + return logits + + def forward(self, **batch): + + target_list = batch['target_list'] + padding_mask = batch['net_input']['padding_mask'] + input_values = batch['net_input']['source'] + output = self.model(input_values=input_values, + attention_mask=padding_mask, + target_list=target_list, + mask_time_indices=None, + return_dict=False) + + def compute_pred(proj_x, target, label_embs): + # compute logits for the i-th label set + y = torch.index_select(label_embs, 0, target.long()) + negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) + # proj_x: (S, D) + # y: (S, D) + # negs: (Neg, S, D) + return self.compute_nce(proj_x, y, negs) + + label_embs_list = self.label_embs_concat.split(self.num_classes, 0) + + x, extra_losses, target_list, mask_indices, padding_mask = output[ + 0], output[-4], output[-3], output[-2], output[-1] + + masked_indices = torch.logical_and(~padding_mask, mask_indices) + proj_x_m = self.final_proj(x[masked_indices]) + proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1) + logp_m_list = [ + compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) + for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) + ] + + targ_m_list = [x.new_zeros(x.size(0), dtype=torch.long) for x in logp_m_list] + + loss = 0.0 + loss_m_list = [] + + for i, (logp_m, targ_m) in enumerate(zip(logp_m_list, targ_m_list)): + loss_m = F.cross_entropy(logp_m, targ_m) + loss_m_list.append(loss_m) + self.log(f"loss_m_{i}", loss_m.detach().item()) + + loss += self.hparams.pred_masked_weight * sum(loss_m_list) + + loss_weights = self.hparams.loss_weights + if loss_weights is not None: + if torch.is_tensor(extra_losses): + extra_losses = [extra_losses] + names = ['extra'] + if len(loss_weights) == 1 and len(extra_losses) != 1: + loss_weights = [loss_weights[0]] * len(extra_losses) + assert len(extra_losses) == len( + loss_weights + ), f"{len(extra_losses)}, {len(loss_weights)}" + for p, n, coef in zip(extra_losses, names, loss_weights): + if coef != 0 and p is not None: + p = coef * p.float() + loss += p + self.log(f"loss_{n}", p.item()) + + return {'loss': loss} + + def training_step(self, batch, batch_idx): + output = self(**batch) + self.log('train_loss', output['loss']) + return output + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / y_true.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self(**batch) + # self.log('val_loss', output.loss, sync_dist=True) + # acc = self.comput_metrix(output.logits, batch['labels']) + # self.log('val_acc', acc, sync_dist=True) + return output + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + 
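        # Older pytorch-lightning versions reset the step counter when resuming from a
        # ckpt, so restore the saved global step (and consumed sample count) by hand.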
global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + from fengshen.utils import UniversalCheckpoint + from fengshen.models.model_utils import add_module_args + args_parser = add_module_args(args_parser) + args_parser = datasets.add_data_specific_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = HubertLightning.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser.add_argument('--ckpt_path', type=str, ) + args = args_parser.parse_args() + + data_module = UniversalDataModule(args=args, tokenizer=None, collate_fn=None) + data_loader = perpare_data(args) + data_module.datasets = data_loader.datasets + module = HubertLightning(args, loader=data_loader) + + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args).callbacks + + if args.ckpt_path is not None and \ + not os.path.exists(args.ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(module, data_module, ckpt_path=args.ckpt_path) diff --git a/fengshen/examples/hubert/pretrain_hubert_base.sh b/fengshen/examples/hubert/pretrain_hubert_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..11e5ddf38361d51910c35b02f10b7e285ab3f0fb --- /dev/null +++ b/fengshen/examples/hubert/pretrain_hubert_base.sh @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +MODEL_NAME=hubert-base-ls960 +config_json="./$MODEL_NAME.ds_config.json" +export MASTER_PORT=29503 +MICRO_BATCH_SIZE=8 +ZERO_STAGE=1 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "tensorboard": { + "enabled": true, + "output_path": "/data/training_model/fengshen-${MODEL_NAME}/ds-tb-logs", + "job_name": "${MODEL_NAME}" + }, + "#flops_profiler": { + "enabled": true, + "profile_step": 200, + "detailed": true, + "output_file": null + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/home/gaoxinyu/torch_extendsions + +DATA_DIR=/data/common_data/librispeech_tsv/datas +LABELS_DIR=/data/common_data/librispeech_tsv/labels + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize 
$MICRO_BATCH_SIZE \ + --val_batchsize 32 \ + --test_batchsize 8 \ + --val_datasets_field valid \ + --test_datasets_field valid \ + --sampler_type random \ + --data ${DATA_DIR} \ + --label_dir ${LABELS_DIR} \ + --labels km \ + --label_rate 100 \ + --max_sample_size 250000 \ + --min_sample_size 32000 \ + --pad_audio False \ + --random_crop True \ + --normalize False \ + " + +MODEL_ARGS="\ + --model_path /data/pretrained_model/$MODEL_NAME/ \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.01 \ + --pred_masked_weight 1.0 \ + --loss_weights 10 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 0 \ + --mode min \ + --every_n_train_steps 10000 \ + --dirpath /data/training_model/ckpt/fengshen-$MODEL_NAME \ + --filename model-{step:02d}-{train_loss:.4f} \ + --every_n_epochs 0 \ + --save_last \ + --not_save_on_train_epoch_end \ + " + +# deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 10 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --val_check_interval 500 \ + --limit_val_batches 10 \ + --accumulate_grad_batches 1 \ + --precision 16 \ + --ckpt_path /data/training_model/ckpt/fengshen-${MODEL_NAME}/last.ckpt \ + --default_root_dir /data/training_model/fengshen-$MODEL_NAME \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +export SCRIPT_PATH=pretrain_hubert.py + +eval python3 -m debugpy --listen localhost:53005 --wait-for-client $SCRIPT_PATH $options diff --git a/fengshen/examples/longformer/README.md b/fengshen/examples/longformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ef4706898b87d2f10eff5df2db24ae3a182ce673 --- /dev/null +++ b/fengshen/examples/longformer/README.md @@ -0,0 +1,34 @@ +# longformer model (Chinese),one model of [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM). +We modify the original position code of longformer to rotational position coding,and on the basis of [chinese_roformer_L-12_H-768_A-12.zip](https://github.com/ZhuiyiTechnology/roformer), use 180G of data to continue training + +## Usage +There is no structure of Longformer-base in [Transformers](https://github.com/huggingface/transformers), you can run follow code to get structure of longformer from [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) + + ```shell + git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git + ``` + +### Load Model +```python +from fengshen import LongformerModel +from fengshen import LongformerConfig +from transformers import BertTokenizer + +tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +config = LongformerConfig.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +model = LongformerModel.from_pretrained("IDEA-CCNL/Erlangshen-Longformer-110M") +``` + + + +## Citation +If you find the resource is useful, please cite the following website in your paper. 
+ +``` +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` diff --git a/fengshen/examples/mt5_summary/fastapi_mt5_summary.py b/fengshen/examples/mt5_summary/fastapi_mt5_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..44adaf8f5855260c683c0bcfe7986ffccc9f25c4 --- /dev/null +++ b/fengshen/examples/mt5_summary/fastapi_mt5_summary.py @@ -0,0 +1,93 @@ +import os +import sys +import uvicorn +import torch +from fastapi import Body, FastAPI +from transformers import T5Tokenizer, MT5ForConditionalGeneration +import pytorch_lightning as pl +sys.path.append(os.path.abspath(os.path.join( + os.path.dirname(__file__), os.path.pardir))) +os.environ["CUDA_VISIBLE_DEVICES"] = '5' +os.environ["MASTER_ADDR"] = '127.0.0.1' +os.environ["MASTER_PORT"] = '6000' +device = "cuda:0" if torch.cuda.is_available() else "cpu" +print('device') +pretrain_model_path = '/cognitive_comp/ganruyi/hf_models/google/mt5-large' +# pretrain_model_path = 'google/mt5-small' +model_path = '/cognitive_comp/ganruyi/fengshen/mt5_large_summary/ckpt/epoch-0-last.ckpt' +tokenizer = T5Tokenizer.from_pretrained(pretrain_model_path) +print('load tokenizer') + + +class MT5FinetuneSummary(pl.LightningModule): + + def __init__(self): + super().__init__() + self.model = MT5ForConditionalGeneration.from_pretrained(pretrain_model_path) + + +model = MT5FinetuneSummary.load_from_checkpoint(model_path) +print('load checkpoint') +model.to(device) +model.eval() +app = FastAPI() +print('server start') + +# def flask_gen(text: str, level: float = 0.9, n_sample: int = 5, length: int = 32, is_beam_search=False): + + +@app.post('/mt5_summary') +async def flask_gen(text: str = Body('', title='原文', embed=True), + n_sample: int = 5, length: int = 32, is_beam_search=False): + if len(text) > 128: + text = text[:128] + text = 'summary:'+text + print(text) + # inputs = tokenizer(text, return_tensors='pt') + inputs = tokenizer.encode_plus( + text, max_length=128, padding='max_length', truncation=True, return_tensors='pt') + # print(inputs) + if is_beam_search: + generated_ids = model.model.generate( + input_ids=inputs['input_ids'].to(device), + attention_mask=inputs['attention_mask'].to(device), + max_length=length, + num_beams=n_sample, + repetition_penalty=2.5, + length_penalty=1.0, + early_stopping=True, + num_return_sequences=n_sample + ) + else: + generated_ids = model.model.generate( + input_ids=inputs['input_ids'].to(device), + attention_mask=inputs['attention_mask'].to(device), + max_length=length, + do_sample=True, + temperature=1.0, + top_p=1.0, + repetition_penalty=2.5, + # early_stopping=True, + num_return_sequences=n_sample + ) + result = [] + # print(tokenizer.all_special_tokens) + for sample in generated_ids: + preds = [tokenizer.decode(sample, skip_special_tokens=True, + clean_up_tokenization_spaces=True)] + preds = ''.join(preds) + # print(preds) + result.append(preds) + return result + + +if __name__ == '__main__': + uvicorn.run(app, host="0.0.0.0", port=6607, log_level="debug") +# # article = "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现, +# 爱碧丽没有自己的生产加工厂。其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价>高达1080元,实际成本仅为每瓶4元!" +# article = '''在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌! +# 今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。 +# 第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。 +# 第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油! 
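# Hedged client example (port and body field follow the uvicorn/route definitions above):
#   curl -X POST 'http://127.0.0.1:6607/mt5_summary?n_sample=3&length=32' \
#        -H 'Content-Type: application/json' -d '{"text": "the article to summarize"}'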
+# 在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!''' + # flask_gen(article, length=30) diff --git a/fengshen/examples/mt5_summary/mt5_summary.py b/fengshen/examples/mt5_summary/mt5_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..de564026ae7a32873cc39515f421adfb9d7e4568 --- /dev/null +++ b/fengshen/examples/mt5_summary/mt5_summary.py @@ -0,0 +1,233 @@ +from fengshen.data.task_dataloader.task_datasets import LCSTSDataModel +from transformers import T5Tokenizer, MT5ForConditionalGeneration +from transformers.optimization import get_linear_schedule_with_warmup +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import ModelCheckpoint +from transformers import AutoTokenizer +import pytorch_lightning as pl +import json +import argparse +import torch +import os +import sys +sys.path.append('./') + +# os.environ["CUDA_VISIBLE_DEVICES"] = '4,5,6,7' + + +def test(): + tokenizer = T5Tokenizer.from_pretrained("google/mt5-small") + article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien." + summary = "Weiter Verhandlung in Syrien." + article = "日前,方舟子发文直指林志颖旗下爱碧丽推销假保健品,引起哗然。调查发现,爱碧丽没有自己的生产加工厂。 \ + 其胶原蛋白饮品无核心研发,全部代工生产。号称有“逆生长”功效的爱碧丽“梦幻奇迹限量组”售价>高达1080元,实际成本仅为每瓶4元!" + summary = "林志颖公司疑涉虚假营销无厂房无研发" + inputs = tokenizer(article, rturn_tensors="pt") + tt = tokenizer.encode_plus(summary, max_length=64, + padding='max_length', truncation='longest_first') + print('tt:', tt) + print('inputs:', inputs) + with tokenizer.as_target_tokenizer(): + labels = tokenizer(summary, return_tensors="pt") + print('labels:', labels) + print('origin labels:', tokenizer.decode(labels['input_ids'][0])) + + model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small") + # outputs = model(input_ids=inputs["input_ids"], labels=labels["input_ids"]) + # print(outputs.keys()) + + # evaluation + model.eval() + generated_ids = model.generate( + input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + max_length=150, + num_beams=2, + repetition_penalty=2.5, + length_penalty=1.0, + early_stopping=True + ) + preds = [tokenizer.decode(g, skip_special_tokens=True, + clean_up_tokenization_spaces=True) for g in generated_ids] + print(preds) + + +class MT5FinetuneSummaryModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class MT5FinetuneSummary(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + 
parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = MT5ForConditionalGeneration.from_pretrained(args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data / + (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + text = batch['text'] + summary = batch['summary'] + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.args.max_dec_length + ) + return {"pred": generated_ids, "text": text, "summary": summary} + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def save_test(data, args, data_model): + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_path) + with open(os.path.join(args.output_save_path), 'w', encoding='utf-8') as f: + for _, batch in enumerate(data): + texts = batch['text'] + summarys = batch['summary'] + preds = batch['pred'] + for idx, pred_ids in enumerate(preds): + text = texts[idx] + summary = summarys[idx] + tmp_result = dict() + preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) + for g in pred_ids] + tmp_result['summary'] = ''.join(preds) + tmp_result['label'] = summary + tmp_result['origin_text'] = text + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = 
argparse.ArgumentParser("Summary Task") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--pretrained_model_path', default='google/mt5-small', type=str) + total_parser.add_argument('--output_save_path', default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = LCSTSDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = MT5FinetuneSummaryModelCheckpoint.add_argparse_args(total_parser) + total_parser = MT5FinetuneSummary.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = LCSTSDataModel(args) + if not args.do_eval_only: + model = MT5FinetuneSummary(args, len(data_model.train_dataloader())) + checkpoint_callback = MT5FinetuneSummaryModelCheckpoint(args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='mt5_summary') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + else: + trainer = Trainer.from_argparse_args(args) + model = MT5FinetuneSummary.load_from_checkpoint( + args.resume_from_checkpoint, args=args, num_data=len(data_model.predict_dataloader())) + result = trainer.predict(model, data_model) + if torch.distributed.get_rank() == 0: + save_test(result, args, data_model) + + +if __name__ == '__main__': + main() + # test() + +''' +python examples/mt5_summary.py --gpus=1 --test_data=test_public.jsonl +--default_root_dir=/cognitive_comp/ganruyi/fengshen/mt5_summary/eval +--do_eval_only +--resume_from_checkpoint=/cognitive_comp/ganruyi/fengshen/mt5_summary/ckpt/model-epoch=01-train_loss=1.9166.ckpt +--strategy=ddp +''' diff --git a/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh b/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..a77b88006211d6f7a432672f4ac29a58d9865d66 --- /dev/null +++ b/fengshen/examples/mt5_summary/pretrain_mt5_summary.sh @@ -0,0 +1,118 @@ +#!/bin/bash +#SBATCH --job-name=mt5_large_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH -o /cognitive_comp/ganruyi/fengshen/mt5_large_summary/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/fengshen/mt5_large_summary/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/ganruyi/fengshen/mt5_large_summary + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + 
"contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy ddp \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/ +prompt="summary:" +DATA_ARGS=" + --data_dir $DATA_DIR + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.jsonl\ + --valid_data valid.jsonl\ + --test_data valid.jsonl\ + --prompt $prompt \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-large \ + --output_save_path $ROOT_DIR/mt5_large_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/fengshen/examples/mt5_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +#singularity exec --nv -B /cognitive_comp/ganruyi/Megatron/:/cognitive_comp/ganruyi/Megatron/,/cognitive_comp/gaoxinyu/:/cognitive_comp/gaoxinyu/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +clear; srun singularity exec --nv -B /cognitive_comp/ganruyi/:/cognitive_comp/ganruyi/ $SINGULARITY_PATH bash -c 'python $CMD' \ No newline at end of file diff --git a/fengshen/examples/pegasus/README.md b/fengshen/examples/pegasus/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f04b83c348ab2fe34a06a428523bc48169c7b478 --- /dev/null +++ b/fengshen/examples/pegasus/README.md @@ -0,0 +1,78 @@ +# 燃灯系列-Pegasus摘要模型预训练 +Pegasus预训练模型是专门为摘要任务而设计的预训练模型,相比于其它通用预训练模型,Pegasus 模型的架构设计更贴近下游的摘要任务,在摘要抽取的效果上的表现相比其他通用模型表现更好 + +### 模型架构和参数 +Pegasus的模型架构是标准的encoder-decoder的Transformer结构,训练任务是用的是GSG( Gap Sentences Generation)任务。GSG任务主要是通过对文本中的重要的句子进行mask,然后再通过decoder恢复。模型详细参数可看config.json + +1. base版本 + +| 配置 | 参数 | +| ---- | ---- | +| encoder layers | 12 | +| encoder_attention_heads | 12 | +| encoder_ffn_dim | 3072 | +| decoder layers | 12 | +| decoder_attention_heads| 12 | +| decoder_ffn_dim | 3072 | +| max_encode_length | 512 | + +2. large 版本 + +| 配置 | 参数 | +| ---- | ---- | +| encoder layers | 16 | +| encoder_attention_heads | 16 | +| encoder_ffn_dim | 4096 | +| decoder layers | 16 | +| decoder_attention_heads| 16 | +| decoder_ffn_dim | 4096 | +| max_encode_length | 1024 | + +### 训练数据 +训练数据使用的是wudao 180g数据。数据进行了简单的预处理包括: +1. 过滤过长单句(这样的句子通常会包括一些乱码句,无上下文语义的列表句、各种符号句,歌词句等) +2. 过滤句子数过少文本,如句子数少于3句则抛弃 + +### 模型 + +pegasus-base: [Randeng_pegasus_238M_summary](https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_238M_Summary)
+pegasus-large: [Randeng_pegasus_523M_summary](https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_523M_Summary) + +主要文件: +- tokenizers_pegasus.py 中文版pegasus的tokenize实现 +- pretrain_pegasus.py 模型预训练的核心实现文件 +- pretrain_pegasusu.sh 预训练脚本,具体参数可通过此脚本修改 +- data_utils.py 模型的一些工具代码 + +#### 使用方式 +可直接通过Hugging face或者pytoch-ligthning框架调用。下面给出的例子是hugging face的调用方法: +```python +from transformers import PegasusForConditionalGeneration +# Need to download tokenizers_pegasus.py and other Python script from Fengshenbang-LM github repo in advance, +# or you can mv download in tokenizers_pegasus.py and data_utils.py in https://huggingface.co./IDEA-CCNL/Randeng_Pegasus_238M_Summary/tree/main +# Stronly recomend you git clone the Fengshenbang-LM repo: +# 1. git clone https://github.com/IDEA-CCNL/Fengshenbang-LM +# 2. cd Fengshenbang-LM/fengshen/examples/pegasus/ +# and then you will see the tokenizers_pegasus.py and data_utils.py which are needed by pegasus model +from tokenizers_pegasus import PegasusTokenizer + +model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/randeng_pegasus_238M_summary") +tokenizer = PegasusTokenizer.from_pretrained("path/to/vocab.txt") + +text = "在北京冬奥会自由式滑雪女子坡面障碍技巧决赛中,中国选手谷爱凌夺得银牌。祝贺谷爱凌!今天上午,自由式滑雪女子坡面障碍技巧决赛举行。决赛分三轮进行,取选手最佳成绩排名决出奖牌。第一跳,中国选手谷爱凌获得69.90分。在12位选手中排名第三。完成动作后,谷爱凌又扮了个鬼脸,甚是可爱。第二轮中,谷爱凌在道具区第三个障碍处失误,落地时摔倒。获得16.98分。网友:摔倒了也没关系,继续加油!在第二跳失误摔倒的情况下,谷爱凌顶住压力,第三跳稳稳发挥,流畅落地!获得86.23分!此轮比赛,共12位选手参赛,谷爱凌第10位出场。网友:看比赛时我比谷爱凌紧张,加油!" +inputs = tokenizer(text, max_length=1024, return_tensors="pt") + +# Generate Summary +summary_ids = model.generate(inputs["input_ids"]) +tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +``` + +### 下游效果 + +#### LCSTS摘要数据finetune后效果 + +| model | rouge-1 | rouge-2 | rouge-L | +| ---- | ---- | ---- | ---- | +| Pegasus-base | 44.13 | 31.31 | 41.06 | +| Pegasus-large | 49.42 | 37.91 | 46.63 | + diff --git a/fengshen/examples/pegasus/data_utils.py b/fengshen/examples/pegasus/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..879798749bc06d6857c01ec101baf5f3fb61d012 --- /dev/null +++ b/fengshen/examples/pegasus/data_utils.py @@ -0,0 +1,319 @@ +# -*- coding: utf-8 -*- + +import re +import six +import unicodedata +import torch +import rouge +import numpy as np +import random +# from fengshen.examples.pegasus.pegasus_utils import text_segmentate +import sys + +sys.path.append('../../../') + +rouge = rouge.Rouge() + + +is_py2 = six.PY2 + +if not is_py2: + basestring = str + + +def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
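    # Ranges tested below: CJK Unified Ideographs (U+4E00..U+9FFF), Extension A
    # (U+3400..U+4DBF), Extensions B through E (separate blocks within U+20000..U+2CEAF),
    # and the CJK Compatibility Ideographs blocks (U+F900..U+FAFF, U+2F800..U+2FA1F).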
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): + return True + + return False + + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( + cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_string(s): + """判断是否是字符串 + """ + return isinstance(s, basestring) + + +def is_stopwords(word, stopwords): + if word in stopwords: + return True + else: + return False + + +def text_segmentate(text): + en_seg_pattern = '((?:\\!|\\?|\\.|\\n)+(?:\\s)+)' + ch_seg_pattern = '((?:?|!|。|\\n)+)' + try: + text = re.sub(en_seg_pattern, r'\1[SEP]', text) + # print("sub text: ", text) + except Exception as e: + print("input: ", text) + raise e + text = re.sub(ch_seg_pattern, r'\1[SEP]', text) + # print("sub ch text: ", text) + text_list = text.split("[SEP]") + text_list = list(filter(lambda x: len(x) != 0, text_list)) + return text_list + + +def load_stopwords(stopwords_path): + stopwords_dict = {} + with open(stopwords_path, "r") as rf: + for line in rf: + line = line.strip() + if line not in stopwords_dict: + stopwords_dict[line] = 0 + else: + pass + return stopwords_dict + + +def text_process(text, max_length): + """分割文本 + """ + texts = text_segmentate(text) + + result, length = [], 0 + for text in texts: + if length + len(text) > max_length * 1.3 and len(result) >= 3: + yield result + result, length = [], 0 + result.append(text) + length += len(text) + if result and len(result) >= 3: + yield result + + +def text_process_split_long_content(text, max_length): + """分割长文本 + """ + texts = text_segmentate(text) + + result, sentence_num = "", 0 + for text in texts: + if len(text) > 500: + if len(result) > 300 and sentence_num >= 3: + yield result + result, sentence_num = "", 0 + else: + result, sentence_num = "", 0 + continue + else: + if len(result) + len(text) > max_length * 1.1 and sentence_num >= 3: + yield result + result, sentence_num = "", 0 + result += text + sentence_num += 1 + + if result and sentence_num >= 3: + yield result + + +def gather_join(texts, idxs): + """取出对应的text,然后拼接起来 + """ + return ''.join([texts[i] for i in idxs]) + + +def gather_join_f1(texts_token, idsx): + join_texts = [] + for id in idsx: + 
join_texts.extend(texts_token[id]) + return join_texts + + +def compute_rouge(source, target): + """计算rouge-1、rouge-2、rouge-l + """ + source, target = ' '.join(source), ' '.join(target) + try: + scores = rouge.get_scores(hyps=source, refs=target) + return { + 'rouge-1': scores[0]['rouge-1']['f'], + 'rouge-2': scores[0]['rouge-2']['f'], + 'rouge-l': scores[0]['rouge-l']['f'], + } + except ValueError: + return { + 'rouge-1': 0.0, + 'rouge-2': 0.0, + 'rouge-l': 0.0, + } + + +def remove_stopwords(texts, stopwords_dict): + for i, text in enumerate(texts): + texts[i] = list(filter(lambda x: x not in stopwords_dict, text)) + return texts + + +def pseudo_summary_f1(texts, + stopwords, + tokenizer, + max_length, + rouge_strategy="rouge-l"): + """构建伪标签摘要数据集 + """ + summary_rate = 0.25 + max_length = max_length - 1 + texts_tokens = [] + sentece_idxs_vec = [] + for text in texts: + if len(texts) == 0: + continue + try: + ids = tokenizer.encode(text.strip())[:-1] + except ValueError: + print("error, input : ", text) + raise ValueError + sentece_idxs_vec.append(ids) + tokens = [tokenizer._convert_id_to_token(token) for token in ids] + texts_tokens.append(tokens) + + texts_tokens_rm = remove_stopwords(texts_tokens, stopwords) + source_idxs, target_idxs = list(range(len(texts))), [] + + assert len(texts_tokens) == len(texts) + # truncate_index = 0 + while True: + sims = [] + for i in source_idxs: + new_source_idxs = [j for j in source_idxs if j != i] + new_target_idxs = sorted(target_idxs + [i]) + new_source = gather_join_f1(texts_tokens_rm, new_source_idxs) + new_target = gather_join_f1(texts_tokens_rm, new_target_idxs) + sim = compute_rouge(new_source, new_target)[rouge_strategy] + sims.append(sim) + new_idx = source_idxs[np.argmax(sims)] + del sims + source_idxs.remove(new_idx) + target_idxs = sorted(target_idxs + [new_idx]) + source = gather_join(texts, source_idxs) + target = gather_join(texts, target_idxs) + try: + if (len(source_idxs) == 1 + or 1.0 * len(target) / len(source) > summary_rate): + break + except ZeroDivisionError as e: + print(e.meesage) + print(texts) + print("source: ", source) + print("target: ", target) + + if len(source) < len(target): + source, target = target, source + source_idxs, target_idxs = target_idxs, source_idxs + + return sentece_idxs_vec, source, target, source_idxs, target_idxs + + +def get_input_mask(sentence_id_vec, indexs): + target_idxs = [] + input_idxs = [] + kMaskSentenceTokenId = 2 + kEosTokenId = 1 + mask_sentence_options_cumulative_prob = [0.9, 0.9, 1, 1] + for index in indexs: + target_idxs.extend(sentence_id_vec[index]) + choice = random.uniform(0, 1) + if choice < mask_sentence_options_cumulative_prob[0]: + # print("mask index: ", index) + sentence_id_vec[index] = [kMaskSentenceTokenId] + elif choice < mask_sentence_options_cumulative_prob[1]: + # print("replace index: ", index) + replace_id = random.randint(0, len(sentence_id_vec)) + sentence_id_vec[index] = sentence_id_vec[replace_id] + elif choice < mask_sentence_options_cumulative_prob[2]: + pass + else: + sentence_id_vec[index] = [] + + target_idxs.append(kEosTokenId) + # print(sentence_id_vec) + for index, sentence_id in enumerate(sentence_id_vec): + # print(index, sentence_id) + if len(sentence_id) == 0: + continue + input_idxs.extend(sentence_id_vec[index]) + + input_idxs.append(kEosTokenId) + return input_idxs, target_idxs + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, + decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
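+    Used to build `decoder_input_ids` from `labels` for teacher forcing: position 0 is set
+    to `decoder_start_token_id` and any -100 label values are mapped back to `pad_token_id`.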
+ """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def padding_to_maxlength(ids, max_length, pad_id): + cur_len = len(ids) + len_diff = max_length - cur_len + return ids + [pad_id] * len_diff, [1] * cur_len + [0] * len_diff diff --git a/fengshen/examples/pegasus/pretrain_pegasus.py b/fengshen/examples/pegasus/pretrain_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..0059355f5d5bf6d149e01fc3dc15d3a760932733 --- /dev/null +++ b/fengshen/examples/pegasus/pretrain_pegasus.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + + +from fengshen.models.model_utils import add_module_args +from transformers import PegasusForConditionalGeneration, PegasusConfig +from pytorch_lightning import Trainer, loggers, LightningModule +from pytorch_lightning.callbacks import LearningRateMonitor +from tokenizers_pegasus import PegasusTokenizer +from utils import UniversalCheckpoint +from data.universal_datamodule import UniversalDataModule +from data_utils import ( + get_input_mask, pseudo_summary_f1, shift_tokens_right, + padding_to_maxlength, load_stopwords, text_segmentate) +import argparse +import torch +import os +import sys + +sys.path.append('../../') + + +# os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +class FakeAbstractCollator: + + def __init__(self, tokenizer, stopwords_dict, max_enc_length): + self.tokenizer = tokenizer + self.max_seq_length = max_enc_length + self.stopwords_dict = stopwords_dict + + def __call__(self, samples): + # print("samples: ", samples) + labels = [] + attn_mask = [] + decoder_attn_mask = [] + source_inputs = [] + + for text in samples: + texts = text["chunks"] + text = text_segmentate(texts) + sentence_id_vec, source, target, source_idxs, target_idxs = pseudo_summary_f1( + text, self.stopwords_dict, self.tokenizer, self.max_seq_length, + "rouge-l") + source_idxs, target_idxs = get_input_mask(sentence_id_vec, + target_idxs) + if len(source_idxs) > self.max_seq_length: + if 2 not in source_idxs[self.max_seq_length - 1:]: + source_idxs = source_idxs[:self.max_seq_length] + source_idxs[-1] = self.tokenizer.eos_token_id + sys.stderr.write("Warning split long line: " + source + + "\n") + else: + continue + + source_idxs, attention_mask = padding_to_maxlength( + source_idxs, self.max_seq_length, self.tokenizer.pad_token_id) + label, target_attention_mask = padding_to_maxlength( + target_idxs, self.max_seq_length, self.tokenizer.pad_token_id) + # print("sample len: ", len(source_idxs)) + source_inputs.append(source_idxs) + attn_mask.append(attention_mask) + decoder_attn_mask.append(target_attention_mask) + labels.append(label) + labels = torch.tensor(labels) + decode_input_idxs = shift_tokens_right(labels, + self.tokenizer.pad_token_id, + self.tokenizer.pad_token_id) + end_token_index = torch.where(labels == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + labels[idx][end_idx + 1:] = -100 + + # print("call samples: ") + return { + "input_ids": torch.tensor(source_inputs), + "attention_mask": torch.tensor(attn_mask), + "labels": labels, + "decoder_input_ids": decode_input_idxs, + "decoder_attention_mask": torch.tensor(decoder_attn_mask) + } + 
+ +class PegasusChineseModel(LightningModule): + + def __init__(self, args, **kwargs): + super().__init__() + self.args = args + self.save_hyperparameters(args) + config = PegasusConfig.from_json_file( + os.path.join(args.model_path, "config.json")) + print("vocab_size: ", config.vocab_size) + self.model = PegasusForConditionalGeneration(config=config) + print("model.num_parameters: ", self.model.num_parameters()) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader( + ) + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // + tb_size) // ab_size + print('Total training step:', self.total_steps) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model(**batch) + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1, )) + y_true = labels.view(size=(-1, )).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank( + ) == 0: + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format( + checkpoint['epoch'], checkpoint['global_step']))) + + +def main(): + args_parser = argparse.ArgumentParser("Pegasus Task") + + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser = add_module_args(args_parser) + args_parser.add_argument('--deepspeed') + args_parser.add_argument( + '--stopword_path', + default="/cognitive_comp/dongxiaoqun/project/pegasus/own/pegasus/stopwords", + type=str) + args_parser.add_argument('--max_seq_length', default=1024, type=int) + args = args_parser.parse_args() + + tokenizer = PegasusTokenizer.from_pretrained(args.model_path) + stopwords_dict = load_stopwords(args.stopword_path) + collator = FakeAbstractCollator(tokenizer, stopwords_dict, + args.max_seq_length) + data_module = UniversalDataModule(tokenizer=tokenizer, + args=args, + collate_fn=collator) + module = PegasusChineseModel(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args).callbacks + + # autotuning + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + trainer = Trainer.from_argparse_args( + args, logger=logger, callbacks=[lr_monitor, checkpoint_callback]) + + trainer.fit(module, data_module) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pegasus/pretrain_pegasus.sh 
b/fengshen/examples/pegasus/pretrain_pegasus.sh new file mode 100644 index 0000000000000000000000000000000000000000..3a371ac45463317fa01fa84a72f5df6bb9ca6bd5 --- /dev/null +++ b/fengshen/examples/pegasus/pretrain_pegasus.sh @@ -0,0 +1,119 @@ +#!/bin/bash +#SBATCH --job-name=pegasus-base_last # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) + + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=pegasus-base_test + +config_json="./$MODEL_NAME.ds_config.json" +export MASTER_PORT=$[RANDOM%10000+40000] + +MICRO_BATCH_SIZE=4 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "Adam" + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 80000000, + "warmup_num_steps" : 50000 + }, + "type": "WarmupDecayLR" + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions + +DATA_ARGS="\ + --datasets_name wudao_180g_512 \ + --num_workers 20 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize 8 \ + --test_batchsize 8 \ + --max_seq_length 512 \ + --val_datasets_field valid \ + " + +MODEL_ARGS="\ + --model_path /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-base/ \ + --learning_rate 1e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.001 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 3 \ + --mode min \ + --every_n_train_steps 200 \ + --dirpath /cognitive_comp/dongxiaoqun/train_model/fengshen-$MODEL_NAME_debug/ckpt \ + --filename model-{step:02d}-{train_loss:.4f} \ + --save_last \ + " + +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --log_every_n_steps 100 \ + --val_check_interval 0.1 \ + --accumulate_grad_batches 8 \ + --default_root_dir /cognitive_comp/dongxiaoqun/train_model/fengshen-$MODEL_NAME_debug \ + --stopword_path /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-large/stopwords \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +SINGULARITY_PATH=/cognitive_comp/dongxiaoqun/software/docker/pytorch21_06_py3_docker_image_v2.sif +export SCRIPT_PATH=/cognitive_comp/dongxiaoqun/project/idea-ccnl/bug_fix/Fengshenbang-LM/fengshen/examples/pegasus/pretrain_pegasus.py + +# python $SCRIPT_PATH $options +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:2 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=226191 bash -c 'python3 $SCRIPT_PATH $options' diff --git a/fengshen/examples/pegasus/tokenizers_pegasus.py b/fengshen/examples/pegasus/tokenizers_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..f532875987b59a42aca9ad35eb7a1945c992869b --- 
/dev/null +++ b/fengshen/examples/pegasus/tokenizers_pegasus.py @@ -0,0 +1,597 @@ +from fengshen.examples.pegasus.data_utils import ( + _is_control, + _is_punctuation, + _is_whitespace, + _is_chinese_char) +from transformers import PreTrainedTokenizer +from transformers import logging +from typing import List, Optional, Tuple, Union +import collections +import os +import unicodedata +import re +import jieba +import sys + +sys.path.append("../../../../") + +jieba.dt.tmp_dir = os.path.expanduser("~/.cache/") +# jieba.enable_parallel(8) +jieba.initialize() + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class PegasusTokenizer(PreTrainedTokenizer): + # copy from BertTokenizer + r""" + Construct a Pegasus tokenizer. Based on WordPiece. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
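+        Example (illustrative local vocab path):
+            >>> tokenizer = PegasusTokenizer("path/to/vocab.txt")
+            >>> tokenizer.tokenize("祝贺谷爱凌夺得银牌")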
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + # pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + sep_token="[SEP]", + cls_token="[CLS]", + tokenize_chinese_chars=True, + strip_accents=None, + offset=100, + pre_tokenizer=lambda x: jieba.cut(x, HMM=False), + **kwargs): + self.offset = offset + + if additional_special_tokens is not None: + if not isinstance(additional_special_tokens, list): + raise TypeError( + f"additional_special_tokens should be of type {type(list)}, \ + but is {type(additional_special_tokens)}" + ) + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens + and mask_token_sent is not None else additional_special_tokens) + + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range( + len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != len( + additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens \ + do not contain an incorrectly shifted list of tokens. \ + Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [ + mask_token_sent + ] if mask_token_sent is not None else [] + # additional_special_tokens += [f"" for i in range(3, self.offset)] + + # print("additional_special_tokens: ", additional_special_tokens) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. 
\ + To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + eos_token=eos_token, + tokenize_chinese_chars=tokenize_chinese_chars, + additional_special_tokens=additional_special_tokens, + strip_accents=strip_accents, + **kwargs, + ) + + self.pre_tokenizer = pre_tokenizer + self.mask_token_sent = mask_token_sent + self.vocab = load_vocab(vocab_file) + + self.vocab[self.eos_token] = self.vocab.pop("[unused1]") + # self.vocab[self.eos_token] = self.vocab.pop("[unused2]") + self.vocab[self.pad_token] = self.vocab.pop("[PAD]") + self.vocab[self.unk_token] = self.vocab.pop("[UNK]") + + if self.mask_token_sent is not None: + self.vocab[self.mask_token] = self.vocab.pop("[unused3]") + self.vocab[self.mask_token_sent] = self.vocab.pop("[unused2]") + + self.ids_to_tokens = collections.OrderedDict([ + (ids, tok) for tok, ids in self.vocab.items() + ]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + # print("pegasus_tokenizer: ", text) + for text in self.pre_tokenizer(text): + if text in self.vocab: + split_tokens.append(text) + else: + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize( + token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + @staticmethod + def _cjk_punctuation(): + return u'\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\ + \uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\ + \uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\ + \u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\ + \u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002' + + def convert_ids_to_tokens( + self, + ids: Union[int, List[int]], + skip_special_tokens: bool = False) -> Union[str, List[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. 
+ Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + Returns: + `str` or `List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids and index != 2: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # for token in + # tokens = tokens or self.ids_to_tokens(ids) + # tokens = [token for token in tokens if not self._is_special(token)] + + text = '' + for i, token in enumerate(tokens): + if token[:2] == '##': + text += token[2:] + elif len(token) == 1 and _is_chinese_char(ord(token)): + text += token + elif len(token) == 1 and _is_punctuation(token): + text += token + text += ' ' + elif i > 0 and _is_chinese_char(ord(text[-1])): + text += token + elif tokens == "": + continue + else: + text += ' ' + text += token + + text = re.sub(' +', ' ', text) + text = re.sub('\' (re|m|s|t|ve|d|ll) ', '\'\\1 ', text) + punctuation = re.sub(' +', '', self._cjk_punctuation()).strip() + '+-/={(<[' + punctuation_regex = '|'.join([re.escape(p) for p in punctuation]) + punctuation_regex = '(%s) ' % punctuation_regex + text = re.sub(punctuation_regex, '\\1', text) + text = re.sub(r'(\d\.) (\d)', '\\1\\2', text) + + return text.strip() + # out_string = " ".join(tokens).replace(" ##", "").strip() + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating + and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence: + - single sequence: `X ` + - pair of sequences: `A B ` (not intended use) + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def _special_token_mask(self, seq): + all_special_ids = set( + self.all_special_ids) # call it once instead of inside list comp + # all_special_ids.remove(self.unk_token_id) # is only sometimes special + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [self.eos_token_id] + else: + return self._special_token_mask(token_ids_0 + + token_ids_1) + [self.eos_token_id] + + def num_special_tokens_to_add(self, pair=False): + """Just EOS""" + return 1 + + def save_vocabulary(self, + save_directory: str, + filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = (filename_prefix + + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), + key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!") + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file, ) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents: (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + """ + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union( + set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. 
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/fengshen/examples/pretrain_bert/README.md b/fengshen/examples/pretrain_bert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1761095920188083853fb3df47927f0f9c008b76 --- /dev/null +++ b/fengshen/examples/pretrain_bert/README.md @@ -0,0 +1,78 @@ +# Bert预训练 + +## 背景 + +我们有持续收集了一部分语料,有一套自建的数据处理流程。位了验证数据处理的效果,从零开始预训练了2个base级别的Bert模型,一个是基于自建数据,一个是基于同行们开源的数据。总体来说数据效果差别不大,下面只介绍一下本次预训练的流程。 + +## 数据处理 + +我们的原始语料主要源自common crawl以及一些开源的高质量语料,经过一些列的数据清洗之后,我们的数据格式为jsonline。例如(摘自内部数据): +```json +{"text":"据悉,河南博物馆成立于1927年,目前拥有超过170000件(套)的文物收藏,包括Jiahu骨笛,雌性猫头鹰雕像,cloud-patterned铜禁,Duling Fangding,莲花和起重机广场,和玉柄剑,黄金从武则天滑落,四神云雾壁画和汝窑天蓝釉雕鹅颈瓶是九大镇厅的珍品。院中的藏品以史前文物、商周青铜器、陶瓷、玉器和石雕等为特色。高质量文物数量多、品种齐全、品位高、价值高。它们是见证中国文明发展、展示中国历史发展的文化艺术宝库。"} +{"text": "功夫不负有心人,1925年,万氏兄弟试制动画片初获成果,并获得了商务印书馆的大力支持。其后兄弟们再接再厉,直到1927年,一部黑白无声动画片《大闹画室》诞生了爱尔兰风笛。据《申报》记载,“该片内容画人与真人合作锁梦楼,滑稽处甚多,令人观后,捧腹不止。”此片曾远销美国放映,并大受赞誉。1930年夏俊娜,万古蟾到大中华百合影片公司工作,万氏兄弟采用了同样的手法拍摄了第二部动画短片《纸人捣乱记》,并于1931年上映。"} +``` + +处理脚本路径:`/cognitive_comp/wuziwei/codes/Fengshenbang-LM/fengshen/data/bert_dataloader` + +该路径下面有3个文件,`auto_split.sh`和`preprocessing.py`是原始数据预处理的脚本,`load.py是fs_data`的处理脚本,执行顺序如下: + +#### step 1 + +执行`auto_split.sh`文件,作用是分割大文件,超过1GB的文件,会自动分割未300M的小文件。使用方法如下: + +`sh auto_split.sh 你的数据文件路径` + +#### step 2 + +执行`preprocessing.py`文件,该文件的作用主要是分句,为什么不嵌入到collate_fn中做,是发现那样效率会慢一些,所以单独拿出来做了。 +执行`python preprocessing.py`即可,注意修改脚本内的文件路径。 + +#### step 3 + 
+`load.py`文件是用fsdata的方式加载数据集,也是执行即可。执行一遍,后续的加载可以实现180GB的数据秒入~ + +前面两步是为了提高load.py文件生成缓存文件的速度。经过这几步的处理以及collate_fn函数(bert mask 策略的实现),最终变成bert的输入。如下: + +*ps: collate_fn在`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.py`脚本下,由DataCollate类实现。* + +```json +{ +"input_ids": torch.tensor(input_ids), +"labels": torch.tensor(batch_labels), +"attention_mask": torch.tensor(attention_mask), +"token_type_ids": torch.tensor(token_type_ids) +} +``` + +## 模型结构 + +模型结构即为标准的bert-base,即: +| 配置 | 参数 | +| :---------: | :---: | +| nlayers | 12 | +| nheaders | 12 | +| hidden-size | 768 | +| seq-length | 512 | +| vocab-size | 21128 | + +## 任务以及Mask策略 + +*mask策略的实现在`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.py`的**DataCollate**类中* + +本次预训练取消了NSP任务,只做mask任务,具体mask策略如下: + +- 15%随机mask + - 80% mask + - 10% 随机替换 + - 10% 保持不变 +- 全词mask (wwm) +- n-gram mask + +由于加入了全词mask和n-gram mask 总体的mask token数量会比英文原始论文的mask比例略高 + +## 预训练执行流程 + +- 训练框架:[Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- 脚本执行:`sh Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.sh` + +*具体配置见`Fengshenbang-LM\fengshen\examples\pretrain_bert\pretrain_bert.sh`* diff --git a/fengshen/examples/pretrain_bert/pretrain_bert.py b/fengshen/examples/pretrain_bert/pretrain_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..a07d7020e10503c4a2b15cfa8456de3264bd13f4 --- /dev/null +++ b/fengshen/examples/pretrain_bert/pretrain_bert.py @@ -0,0 +1,278 @@ +from data.bert_dataloader.load import BertDataModule +from transformers import ( + BertTokenizer, + BertConfig, + BertForPreTraining, + BertModel, + BertForMaskedLM +) +from pytorch_lightning import ( + LightningDataModule, + LightningModule, + loggers, + Trainer, +) +from pytorch_lightning.callbacks import ( + ModelCheckpoint, + LearningRateMonitor, +) +from typing import Optional +from torch.utils.data import DataLoader +from transformers.optimization import get_linear_schedule_with_warmup +import argparse +import sys +import torch +import os +import re +import jieba +import numpy as np + +# 如果没有安装fengshen模块,请把Fengshenbang-LM/fengshen加入到系统环境变量 +sys.path.insert(0, '../../../fengshen') + +os.environ["CUDA_VISIBLE_DEVICES"] = '0,1' + + +class DataCollate(object): + + def __init__(self, tokenizer, max_length, mask_rate=0.15, max_ngram=3, if_padding=True) -> None: + self.tokenizer = tokenizer + self.max_length = max_length + self.word_cuter = jieba.cut + self.vocab_length = len(tokenizer) + self.mask_rate = mask_rate + self.ignore_labels = -100 + self.ngrams = np.arange(1, max_ngram + 1, dtype=np.int64) + pvals = 1. 
/ np.arange(1, max_ngram + 1) + pvals /= pvals.sum(keepdims=True) # p(n) = 1/n / sigma(1/k) + self.pvals = pvals + self.padding = if_padding + + def token_process(self, token_id): + rand = np.random.random() + if rand <= 0.8: + return self.tokenizer.mask_token_id + elif rand <= 0.9: + return token_id + else: + return np.random.randint(1, self.vocab_length) + + def __call__(self, samples): + input_ids = [] + attention_mask = [] + token_type_ids = [] + batch_labels = [] + # print('^-^ batch size :',len(samples)) + for sample in samples: + word_list = list(self.word_cuter(sample['text'])) + mask_ids, labels = [], [] + + record = [] + for i in range(len(word_list)): + rands = np.random.random() + if i in record: + continue + word = word_list[i] + if rands > self.mask_rate and len(word) < 4: + word = word_list[i] + word_encode = tokenizer.encode(word, add_special_tokens=False) + for token in word_encode: + mask_ids.append(token) + labels.append(self.ignore_labels) + record.append(i) + else: + n = np.random.choice(self.ngrams, p=self.pvals) + for index in range(n): + ind = index + i + if ind in record or ind >= len(word_list): + continue + record.append(ind) + word = word_list[ind] + word_encode = tokenizer.encode(word, add_special_tokens=False) + for token in word_encode: + mask_ids.append(self.token_process(token)) + labels.append(token) + if self.padding: + if len(mask_ids) > self.max_length: + input_ids.append(mask_ids[:self.max_length]) + batch_labels.append(labels[:self.max_length]) + else: + lenght = len(mask_ids) + mask_ids.extend([0]*(self.max_length-lenght)) + labels.extend([-100]*(self.max_length-lenght)) + input_ids.append(mask_ids) + batch_labels.append(labels) + attention_mask.append([1]*self.max_length) + token_type_ids.append([0]*self.max_length) + + # print('sentence:',sample['text']) + # print('input_ids:',mask_ids) + # print('decode inputids:',self.tokenizer.decode(mask_ids)) + # print('labels',labels) + # print('decode labels:',self.tokenizer.decode(labels)) + # print('*'*20) + return { + 'input_ids': torch.tensor(input_ids), + 'labels': torch.tensor(batch_labels), + 'attention_mask': torch.tensor(attention_mask), + 'token_type_ids': torch.tensor(token_type_ids) + } + + +class Bert(LightningModule): + @staticmethod + def add_module_specific_args(args_parser): + parser = args_parser.add_argument_group('Bert') + parser.add_argument('--model_path', type=str, default='') + parser.add_argument('--learning_rate', default=1e-5, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return args_parser + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.bertconfig = BertConfig.from_pretrained(args.model_path) + # self.model = BertForPreTraining(self.bertconfig) + self.model = BertForMaskedLM(self.bertconfig) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size + + def configure_optimizers(self): + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 
'weight_decay': self.hparams.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.hparams.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_steps * self.hparams.warmup), + self.total_steps) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + def training_step(self, batch, batch_idx): + output = self.model(**batch) + # print(output) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + ones = torch.ones_like(labels) + zero = torch.zeros_like(labels) + mask = torch.where(labels < 0, zero, ones) + mask = mask.view(size=(-1,)).float() + # y_true=labels.view(size=(-1,)).float() + + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + corr = torch.multiply(corr.float(), mask) + acc = torch.sum(corr.float()) / torch.sum(mask) + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + # print(output) + acc = self.comput_metrix(output.logits, batch['labels']) + print('val_loss ', output.loss) + self.log('val_loss', output.loss) + self.log('val_acc', acc) + # pass + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.prediction_logits + + +class CustomCKPT: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('ckpt call back') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', action='store_true', default=False) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = BertDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Bert.add_module_specific_args(args_parser) + args_parser = CustomCKPT.add_argparse_args(args_parser) + args_parser.add_argument('--deepspeed') + args_parser.add_argument('--seq_max_length') + + args = args_parser.parse_args() + + tokenizer = BertTokenizer.from_pretrained(args.model_path) + collate_fn = DataCollate(tokenizer, 512) + data_module = BertDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + + print('data load complete') + + model = Bert(args) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = CustomCKPT(args).callbacks + + if 
args.resume_from_checkpoint is not None and \ + not os.path.exists(args.resume_from_checkpoint): + print('--------warning no checkpoint found--------, remove args') + del args.resume_from_checkpoint + + # autotuning + if args.deepspeed is not None: + os.environ['PL_DEEPSPEED_CONFIG_PATH'] = args.deepspeed + + trainer = Trainer.from_argparse_args(args, logger=logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module) diff --git a/fengshen/examples/pretrain_bert/pretrain_bert.sh b/fengshen/examples/pretrain_bert/pretrain_bert.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6e6453826d1c6408de4a7e064a7756529b0c6cd --- /dev/null +++ b/fengshen/examples/pretrain_bert/pretrain_bert.sh @@ -0,0 +1,116 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + + +MODEL_NAME=bert-1.3B + +config_json="./$MODEL_NAME.ds_config.json" +((MASTER_PORT=$RANDOM%10000+40000)) +echo $MASTER_PORT +ZERO_STAGE=2 +MICRO_BATCH_SIZE=16 + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "allgather_bucket_size": 2e8 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "Adam" + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 536877, + "warmup_num_steps" : 50000 + }, + "type": "WarmupDecayLR" + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/home/wuziwei/torch_extendsions + +DATA_ARGS="\ + --datasets_name wudao_180g \ + --num_workers 16 \ + --train_batchsize $MICRO_BATCH_SIZE + " + +MODEL_ARGS="\ + --model_path /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/wudao180g_bert_base \ + --learning_rate 1e-5 \ + --weight_decay 0.01 \ + --warmup 0.001 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor train_loss \ + --save_top_k 3 \ + --mode min \ + --save_last \ + --every_n_train_steps 5000 \ + --dirpath /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME \ + --filename model-{step:02d}-{train_loss:.4f} \ + " +TRAINER_ARGS="\ + --gradient_clip_val 1.0 \ + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --log_every_n_steps 100 \ + --val_check_interval 0.1 \ + --check_val_every_n_epoch 1 \ + --accumulate_grad_batches 1 \ + --resume_from_checkpoint /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME/last.ckpt \ + --default_root_dir /data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/$MODEL_NAME \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +export 
SCRIPT_PATH=/data0/wuziwei/codes/Fengshenbang-LM/fengshen/examples/pretrain_bert/pretrain_bert.py + +bash -c 'python3 $SCRIPT_PATH $options' + diff --git a/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py new file mode 100644 index 0000000000000000000000000000000000000000..1487abb15a7419b6c00056b6fcd78e96c8125d8b --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen.py @@ -0,0 +1,237 @@ +from dataclasses import dataclass +from transformers import ( + MegatronBertConfig, + MegatronBertForPreTraining, + AutoTokenizer, +) +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +import argparse +import torch +import os +import numpy as np +import time +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.data_utils.sop_utils import get_a_and_b_segments +from fengshen.data.data_utils.truncate_utils import truncate_segments +from fengshen.data.data_utils.token_type_utils import create_tokens_and_tokentypes +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from torch.utils.data._utils.collate import default_collate + +SHOW_DATA = False + + +@dataclass +class ErLangShenCollator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含Mask和Sop任务 + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + content_key: str = 'text' + # 一些预处理操作 + + def setup(self): + from fengshen.data.data_utils.sentence_split import ChineseSentenceSplitter + self.sentence_split = ChineseSentenceSplitter() + self.np_rng = np.random.RandomState(seed=((int(time.time()) % 2**32))) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + sentences = self.sentence_split.tokenize(s[self.content_key]) + # Divide sample into two segments (A and B). + tokenized_sentences = [self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(sent)) for sent in sentences] + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + if len(tokenized_sentences) > 1: + tokens_a, tokens_b, is_next_random = get_a_and_b_segments(tokenized_sentences, + self.np_rng) + else: + tokens_a = tokenized_sentences[0] + tokens_b = [] + is_next_random = False + # max_seq_length - 3因为还需要拼上[CLS] [SEP] [SEP] + if len(tokens_a) == 0: + continue + _ = truncate_segments(tokens_a, tokens_b, len(tokens_a), + len(tokens_b), self.max_seq_length-3, self.np_rng) + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, tokens_b, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id) + # Masking. + max_predictions_per_seq = self.masked_lm_prob * len(tokens) + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, self.vocab_id_list, self.vocab_id_to_token_dict, self.masked_lm_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert') + + # Some checks. 
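+            # truncate_segments capped tokens_a + tokens_b at max_seq_length - 3 and
+            # create_tokens_and_tokentypes added [CLS]/[SEP]/[SEP], so padding_length >= 0 here.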
+ num_tokens = len(tokens) + padding_length = self.max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [self.tokenizer.pad_token_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * self.max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + labels_np = np.array(labels, dtype=np.int64) + model_inputs.append( + { + 'input_ids': tokens_np, + 'attention_mask': padding_mask_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + 'next_sentence_label': int(is_next_random) + } + ) + return default_collate(model_inputs) + + +class ErLangShenBert(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Erlangshen Bert') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = MegatronBertConfig.from_pretrained(args.model_path) + self.config = config + self.tokenizer = tokenizer + self.model = MegatronBertForPreTraining(config) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, **batch): + return self.model(**batch) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + print(self.config) + print(self.model) + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: {}'.format(batch['labels'][0])) + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self(**batch) + self.log('train_loss', output.loss, sync_dist=True) + label_idx = batch['labels'] != -100 + acc = self.comput_metrix( + output.prediction_logits[label_idx].view(-1, output.prediction_logits.size(-1)), batch['labels'][label_idx]) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self(**batch) + self.log('val_loss', output.loss, sync_dist=True) + return output.loss + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = 
checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = ErLangShenBert.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + collate_fn = ErLangShenCollator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + ) + collate_fn.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + print('data load complete') + + model = ErLangShenBert(args, tokenizer=tokenizer) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..d3368c20dc1d5d287bef0619e341b35cc6228362 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_bert/pretrain_erlangshen_base.sh @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=erlangshen-bert-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 2, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name IDEA-CCNL/PretrainCorpusDemo \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 1 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_erlangshen.py $options diff --git a/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py new file mode 100644 index 0000000000000000000000000000000000000000..e6bd2f81781c5bfcdd55aa1514104f8dec5d8f50 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta.py @@ -0,0 +1,227 @@ +from dataclasses import dataclass +from transformers import ( + DebertaV2Config, + DebertaV2ForMaskedLM, + AutoTokenizer, +) +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +import argparse +import torch +import os +import numpy as np +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.data_utils.truncate_utils import truncate_segments +from fengshen.data.data_utils.token_type_utils import create_tokens_and_tokentypes +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from torch.utils.data._utils.collate import default_collate + +SHOW_DATA = False + + +@dataclass +class DeBERTaV2Collator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含Mask任务,使用Whole Word Mask + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + content_key: str = 'text' + # 一些预处理操作 + + def setup(self): + self.np_rng = np.random.RandomState(seed=42) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + import jieba_fast + self.zh_tokenizer = jieba_fast.lcut + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + tokenized_sentences = self.tokenizer.convert_tokens_to_ids( 
+ self.tokenizer.tokenize(s[self.content_key])) + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + tokens_a = tokenized_sentences + # max_seq_length - 3因为还需要拼上[CLS] [SEP] [SEP] + if len(tokens_a) == 0: + continue + _ = truncate_segments(tokens_a, [], len(tokens_a), + 0, self.max_seq_length-3, self.np_rng) + # Build tokens and toketypes. + tokens, tokentypes = create_tokens_and_tokentypes(tokens_a, [], + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id) + # Masking. + max_predictions_per_seq = self.masked_lm_prob * len(tokens) + (tokens, masked_positions, masked_labels, _, _) = create_masked_lm_predictions( + tokens, self.vocab_id_list, self.vocab_id_to_token_dict, self.masked_lm_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert', + zh_tokenizer=self.zh_tokenizer) + + # Some checks. + num_tokens = len(tokens) + padding_length = self.max_seq_length - num_tokens + assert padding_length >= 0 + assert len(tokentypes) == num_tokens + assert len(masked_positions) == len(masked_labels) + + # Tokens and token types. + filler = [self.tokenizer.pad_token_id] * padding_length + tokens_np = np.array(tokens + filler, dtype=np.int64) + tokentypes_np = np.array(tokentypes + filler, dtype=np.int64) + + # Padding mask. + padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, + dtype=np.int64) + + # Lables and loss mask. + labels = [-100] * self.max_seq_length + for i in range(len(masked_positions)): + assert masked_positions[i] < num_tokens + labels[masked_positions[i]] = masked_labels[i] + labels_np = np.array(labels, dtype=np.int64) + model_inputs.append( + { + 'input_ids': tokens_np, + 'attention_mask': padding_mask_np, + 'token_type_ids': tokentypes_np, + 'labels': labels_np, + } + ) + return default_collate(model_inputs) + + +class ErlangshenDeBERTaV2(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Erlangshen Bert') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = DebertaV2Config.from_pretrained(args.model_path) + self.config = config + self.tokenizer = tokenizer + self.model = DebertaV2ForMaskedLM(config) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, **batch): + return self.model(**batch) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + print(self.config) + print(self.model) + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: 
{}'.format(batch['labels'][0])) + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self(**batch) + self.log('train_loss', output.loss, sync_dist=True) + label_idx = batch['labels'] != -100 + acc = self.comput_metrix( + output.logits[label_idx].view(-1, output.logits.size(-1)), batch['labels'][label_idx]) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self(**batch) + self.log('val_loss', output.loss, sync_dist=True) + return output.loss + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = ErlangshenDeBERTaV2.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + collate_fn = DeBERTaV2Collator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + ) + collate_fn.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collate_fn) + print('data load complete') + + model = ErlangshenDeBERTaV2(args, tokenizer=tokenizer) + print('model load complete') + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf6ad5cb30f14173854aa66bf91d731151ec47d7 --- /dev/null +++ b/fengshen/examples/pretrain_erlangshen_deberta_v2/pretrain_deberta_base.sh @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=erlangshen-deberta-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name IDEA-CCNL/PretrainCorpusDemo \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_deberta.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_deberta.py $options diff --git a/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py b/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c779de17c7b990b05e0e189cc1c486b8678115 --- /dev/null +++ b/fengshen/examples/pretrain_randeng_bart/pretrain_bart.py @@ -0,0 +1,281 @@ +from transformers import AutoTokenizer, BartForConditionalGeneration, BartConfig +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import os +import argparse +import torch +import math +import time +from torch.utils.data._utils.collate import default_collate +from fengshen.data.data_utils.mask_utils import create_masked_lm_predictions +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils import UniversalCheckpoint +from fengshen.models.model_utils import ( + get_total_steps, + configure_optimizers, + add_module_args, +) +import numpy as np +SHOW_DATA = False + + +@ dataclass +class BartCollator: + ''' + 由input处理成samples,也就是最终模型的输入 + 其中主要处理逻辑在__call__里 + 包含text infilling和sentence shuffle任务 + ''' + tokenizer: None # 分词 + max_seq_length: 512 + masked_lm_prob: 0.15 + permute_sentence_ratio: 1.0 + content_key: str = 'text' + + def setup(self): + from fengshen.data.data_utils.sentence_split import ChineseSentenceSplitter + self.sentence_split = ChineseSentenceSplitter() + self.np_rng = np.random.RandomState(seed=((int(time.time()) % 2**32))) + inv_vocab = {v: k for k, v in self.tokenizer.vocab.items()} + self.vocab_id_list = list(inv_vocab.keys()) + self.vocab_id_to_token_dict = inv_vocab + import jieba_fast + self.zh_tokenizer = jieba_fast.lcut + seg_tokens = ['。', ';', ';', '!', '!', '?', '?'] + seg_token_ids = [] + for t in 
seg_tokens: + if t in self.tokenizer.vocab: + seg_token_ids.append(self.tokenizer.vocab[t]) + else: + print('seg_token "{}" not in vocab'.format(t)) + self.seg_token_ids = set(seg_token_ids) + + def permute_sentences(self, source, full_stops, p=1.0): + # Tokens that are full stops, where the previous token is not + sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2 + result = source.clone() + + num_sentences = sentence_ends.size(0) + num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0) + substitutions = torch.randperm(num_sentences)[:num_to_permute] + ordering = torch.arange(0, num_sentences) + ordering[substitutions] = substitutions[torch.randperm(num_to_permute)] + + # Ignore at start + index = 1 + for i in ordering: + sentence = source[(sentence_ends[i - 1] if i > 0 else 1): sentence_ends[i]] + result[index: index + sentence.size(0)] = sentence + index += sentence.size(0) + return result + + def __call__(self, samples): + ''' + samples: 一个sample长这样{"text": "hello world"} + ''' + model_inputs = [] + for s in samples: + sentences = self.sentence_split.tokenize(s[self.content_key]) + tokenized_sentences = [self.tokenizer.convert_tokens_to_ids( + self.tokenizer.tokenize(sent)) for sent in sentences] + if len(tokenized_sentences) == 0: + print('find empty sentence') + continue + + tokens = [self.tokenizer.cls_token_id] + for sent in tokenized_sentences: + for t in sent: + tokens.append(t) + if tokens[-1] != self.tokenizer.sep_token_id: + tokens.append(self.tokenizer.sep_token_id) + + if len(tokens) > self.max_seq_length: + # 找到最后的一句话,如果有的话,尽量保证最后一句话的完整 + last_pos = self.max_seq_length - 1 + for i in range(self.max_seq_length - 1, 0, -1): + if tokens[i-1] in self.seg_token_ids: + last_pos = i + break + tokens = tokens[:last_pos] + + tokens.append(self.tokenizer.sep_token_id) + tokens = torch.LongTensor(tokens) + + full_stops = torch.any(torch.stack([torch.eq(tokens, aelem).logical_or_( + torch.eq(tokens, aelem)) for aelem in self.seg_token_ids], dim=0), dim=0) + + assert (self.max_seq_length - + tokens.shape[0]) >= 0, (tokens.size(), tokens[-1], self.max_seq_length) + + source, target = tokens, tokens.clone() + + if self.permute_sentence_ratio > 0.0: + source = self.permute_sentences(source, full_stops, self.permute_sentence_ratio) + + if self.masked_lm_prob > 0.0: + mask_prob = self.masked_lm_prob * 2 + max_predictions_per_seq = mask_prob * len(source) + (source, _, _, _, _) = create_masked_lm_predictions( + source.numpy(), self.vocab_id_list, self.vocab_id_to_token_dict, mask_prob, + self.tokenizer.cls_token_id, self.tokenizer.sep_token_id, self.tokenizer.mask_token_id, + max_predictions_per_seq, self.np_rng, + masking_style='bert', zh_tokenizer=self.zh_tokenizer) + # 合并[MASK] 因为这里用的是Bert的mask函数,Bert是按字mask的, + # 这里把连续的mask合并成一个MASK从而达到span mask的效果 + span_mask_souce = [] + for t in source: + # 如果是连续的多个mask,则跳过 + if len(span_mask_souce) > 0 \ + and t is self.tokenizer.mask_token_id \ + and span_mask_souce[-1] is self.tokenizer.mask_token_id: + continue + span_mask_souce.append(t) + + source = torch.LongTensor(span_mask_souce) + + assert (source >= 0).all() + # assert (source[1:-1] >= 1).all(), source + assert (source <= self.tokenizer.vocab_size).all() + assert source[0] == self.tokenizer.cls_token_id + assert source[-1] == self.tokenizer.sep_token_id + + prev_output_tokens = torch.zeros_like(target) + # match the preprocessing in fairseq + prev_output_tokens[0] = self.tokenizer.sep_token_id + prev_output_tokens[1:] = target[:-1] + + source_ = 
torch.full((self.max_seq_length,), + self.tokenizer.pad_token_id, dtype=torch.long) + source_[:source.shape[0]] = source + target_ = torch.full((self.max_seq_length,), -100, dtype=torch.long) + target_[:target.shape[0]] = target + prev_output_tokens_ = torch.full( + (self.max_seq_length,), self.tokenizer.pad_token_id, dtype=torch.long) + prev_output_tokens_[:prev_output_tokens.shape[0]] = prev_output_tokens + attention_mask = torch.full((self.max_seq_length,), 0, dtype=torch.long) + attention_mask[:source.shape[0]] = 1 + model_inputs.append({ + "input_ids": source_, + "labels": target_, + "decoder_input_ids": prev_output_tokens_, + "attention_mask": attention_mask, + }) + return default_collate(model_inputs) + + +class RandengBart(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Randeng BART') + parser.add_argument('--masked_lm_prob', type=float, default=0.15) + parser.add_argument('--max_seq_length', type=int, default=512) + parser.add_argument('--sample_content_key', type=str, default='text') + parser.add_argument('--permute_sentence_ratio', type=str, default=1.0) + return parent_parser + + def __init__(self, args, tokenizer, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + config = BartConfig.from_pretrained(args.model_path) + self.model = BartForConditionalGeneration(config) + self.tokenizer = tokenizer + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.tokenizer.convert_ids_to_tokens(token_ids) + return self.tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch, batch_idx): + if self.trainer.global_rank == 0: + global SHOW_DATA + if not SHOW_DATA: + SHOW_DATA = True + print('source: {}'.format(batch['input_ids'][0])) + print('target: {}'.format(batch['labels'][0])) + print('decoder source: {}'.format(batch['decoder_input_ids'][0])) + + print('source: {}'.format(self.detokenize(batch['input_ids'][0]))) + print('decoder source: {}'.format(self.detokenize(batch['decoder_input_ids'][0]))) + label_idx = batch['labels'][0] != -100 + print('target: {}'.format(self.detokenize( + batch['labels'][0][label_idx]))) + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def comput_metrix(self, logits, labels): + label_idx = labels != -100 + labels = labels[label_idx] + logits = logits[label_idx].view(-1, logits.size(-1)) + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.shape[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(**batch) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = 
argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = RandengBart.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + + collator = BartCollator( + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + masked_lm_prob=args.masked_lm_prob, + content_key=args.sample_content_key, + permute_sentence_ratio=args.permute_sentence_ratio, + ) + # 准备一些额外参数 + collator.setup() + data_module = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collator) + + module = RandengBart(args, tokenizer=tokenizer) + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # 做兼容,如果目录不存在的话把这个参数去掉,不然会报错 + if args.load_ckpt_path is not None and \ + not os.path.exists(args.load_ckpt_path): + print('--------warning no checkpoint found--------, remove args') + args.load_ckpt_path = None + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(module, data_module, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh b/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh new file mode 100644 index 0000000000000000000000000000000000000000..2ac4d8d40a2135c7439c150d7b208f94ba002a0d --- /dev/null +++ b/fengshen/examples/pretrain_randeng_bart/pretrain_bart_base.sh @@ -0,0 +1,87 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_bart # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=randeng-bart-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=32 + +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + " +# 如果你有一批数据,可以参照IDEA-CCNL/PretrainCorpusDemo的格式处理,通过参数传入 +# --train_file train.json +# --val_file val.json +# --test_file test.json + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --learning_rate 1e-4 \ + --weight_decay 1e-1 \ + --warmup_ratio 0.01 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_last \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epoch 10 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 1 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " + +python3 pretrain_bart.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain_bart.py $options diff --git a/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh b/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh new file mode 100644 index 0000000000000000000000000000000000000000..5c446fd8784477d1caa1519b614d759aa3cb6ec8 --- /dev/null +++ b/fengshen/examples/pretrain_t5/convert_ckpt_randeng_t5_char.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -x -e + +echo "START TIME: $(date)" +BIN_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M +if [ ! -d ${BIN_DIR} ];then + mkdir ${BIN_DIR} + echo ${BIN_DIR} created!!!!!!!!!!!!!! +else + echo ${BIN_DIR} exist!!!!!!!!!!!!!!! +fi + +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + + +MODEL_ARGS=" + --ckpt_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/ckpt/last.ckpt/checkpoint/mp_rank_00_model_states.pt \ + --bin_path ${BIN_DIR}/pytorch_model.bin \ + --rm_prefix module.model. 
\ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py + +export CMD=" \ + $SCRIPTS_PATH \ + $MODEL_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD diff --git a/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py b/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py new file mode 100644 index 0000000000000000000000000000000000000000..2aeef8c860864d138b0c970baca72a568bf51a19 --- /dev/null +++ b/fengshen/examples/pretrain_t5/convert_ckpt_to_bin.py @@ -0,0 +1,37 @@ +import time +from builtins import print +import argparse + +import torch +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument('--ckpt_path', default=None, type=str) + total_parser.add_argument('--bin_path', default=None, type=str) + total_parser.add_argument('--rm_prefix', default=None, type=str) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + state_dict = torch.load(args.ckpt_path)['module'] + new_state_dict = {} + + if args.rm_prefix is not None: + prefix_len = len(args.rm_prefix) + for k, v in state_dict.items(): + if k[:prefix_len] == args.rm_prefix: + new_state_dict[k[prefix_len:]] = v + else: + new_state_dict[k] = v + else: + new_state_dict = state_dict + torch.save(new_state_dict, args.bin_path) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/finetune_t5.py b/fengshen/examples/pretrain_t5/finetune_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..497b1ca26817d2c1dbf8d1be4b5cea51ad846f4e --- /dev/null +++ b/fengshen/examples/pretrain_t5/finetune_t5.py @@ -0,0 +1,144 @@ +import time +from builtins import print +import sys +import os +import torch +import argparse +import pytorch_lightning as pl +from pytorch_lightning import Trainer, loggers +from transformers import MT5ForConditionalGeneration +from pytorch_lightning.callbacks import LearningRateMonitor +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +class MT5FinetuneModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--keep_tokens_path', default=None, type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = 
self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + # print('is out of index: ', batch['input_ids'][batch['input_ids'] >= 32598]) + output = self.model( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + cond_output = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + force_words_ids=batch['force_words_ids'], + num_beams=2, + ) + cond_acc = self.comput_metrix(cond_output, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + self.log('cond_acc', cond_acc, sync_dist=True) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0 and self.trainer.global_step % self.hparams.every_n_train_steps == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument( + '--new_vocab_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=1024, type=int) + total_parser.add_argument('--ckpt_path', default=None, type=str) + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import TaskT5DataModel + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + # * Args for data preprocessing + total_parser = TaskT5DataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = MT5FinetuneModel.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + print('TaskT5DataModel load start {}'.format(get_time_str())) + data_model = TaskT5DataModel(args) + print('TaskT5DataModel load end {}'.format(get_time_str())) + if not args.do_eval_only: + model = MT5FinetuneModel(args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/')) + trainer = Trainer.from_argparse_args(args, + 
logger=logger, + callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model, ckpt_path=args.ckpt_path) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh b/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh new file mode 100644 index 0000000000000000000000000000000000000000..fccf833bdc954707bdc94d6bef3821239006a2c6 --- /dev/null +++ b/fengshen/examples/pretrain_t5/finetune_unimc_randeng_t5_char_57M.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=finetune_unimc_randeng_t5_char_57M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/finetune_unimc_randeng_t5_char_57M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.finetune_unimc_randeng_t5_char_57M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +export CUDA_VISIBLE_DEVICES='6' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 240000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +TRAIN_DATA_DIR=/cognitive_comp/yangping/data/unidata/multiplechoice/pretraining_alldata/alldata/train.json +VALID_DATA_DIR=/cognitive_comp/yangping/data/unidata/multiplechoice/pretraining_alldata/alldata/dev.json + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${TRAIN_DATA_DIR} \ + --valid_data_path ${TRAIN_DATA_DIR} \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M \ + --tokenizer_type bert_tokenizer \ +" + 
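+# Editor's note (not part of the original script): VALID_DATA_DIR is defined above but
+# DATA_ARGS passes ${TRAIN_DATA_DIR} to --valid_data_path as well. If validation on
+# dev.json is intended, the flag would presumably read:
+#   --valid_data_path ${VALID_DATA_DIR} \
+# Also note that CUDA_VISIBLE_DEVICES='6' exposes a single card, which is why
+# TRAINER_ARGS uses "--gpus 1".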
+SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/finetune_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e9d49e3a83d9a886890740179a9ae3739a58654 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small.sh @@ -0,0 +1,124 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_77M/ + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 50000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_t5_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-small \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ 
+ $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun --jobid=171866 --job-name=randeng_t5_77M --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 -e %x-%j.err -o %x-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 --cpus-per-gpu=20 -t 24:00:00 +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python -u -m debugpy --listen 192.168.190.2:53005 --wait-for-client $CMD' \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh new file mode 100644 index 0000000000000000000000000000000000000000..0a539a7e6a7fb4b750b441df98dd49f166c3c49b --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small_continue.sh @@ -0,0 +1,120 @@ +#!/bin/bash +#SBATCH --job-name=t5_cn_small_pretrain_v2 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err +#SBATCH -x dgx050 + +set -x -e +source activate base + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=32 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/t5_cn_small_pretrain_v2/ + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain_v2.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() + +cat < $config_json +{ + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "params": { + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-08, + "lr": 1e-04, + "weight_decay": 0.01 + }, + "type": "AdamW" + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 10000 + } + }, + "steps_per_print": 100, + "gradient_clipping": 1, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "zero_allow_untested_optimizer": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 0 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_mt5_tokenized + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 1024 \ +" + 
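+# Editor's note (assumption, cross-checked against pretrain_t5.py later in this patch):
+# the vocabulary-pruning branch that consumes --keep_tokens_path only runs when
+# --new_vocab_path is also supplied. This "continue" script omits --new_vocab_path, so
+# the Randeng-T5-77M checkpoint is assumed to already carry the reduced
+# Chinese/English vocabulary. --train_split_size 0.999 presumably reserves the
+# remaining 0.1% of ${DATA_DIR} as the validation split sampled by --val_check_interval.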
+MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/t5_cn_small_pretrain/Randeng-T5-77M \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +# --resume_from_checkpoint /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/ckpt/last.ckpt \ + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 --cpus-per-gpu=20 -t 24:00:00 +clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun --job-name=t5_cn_small_pretrain_v2 --jobid=153124 --nodes=1 --ntasks-per-node=8 --gres=gpu:8 --cpus-per-task=30 -o %x-%j.log -e %x-%j.err singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' diff --git a/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh b/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..be643bb12ddf613e99a5f6ac3bd23f3ab0773a33 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_mt5_small_predict.sh @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=t5_cn_small_pretrain +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=128 +ROOT_DIR=/cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/ + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.t5_cn_small_pretrain.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": 128, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 0, + "warmup_max_lr": 1e-4, + "warmup_num_steps": 10000 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_2 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 
10 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --accumulate_grad_batches 8 \ + --resume_from_checkpoint /cognitive_comp/ganruyi/fengshen/t5_cn_small_pretrain/old-ckpt/last.ckpt \ + --do_eval_only \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_mt5_tokenized + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data wudao_180g_mt5_tokenized\ + --train_split_size 0.999 \ + --max_seq_length 1024 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-small \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/fengshen/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +/home/ganruyi/anaconda3/bin/python $CMD \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh new file mode 100644 index 0000000000000000000000000000000000000000..6b85b4886dffc191c6d4856f66c2b3fd51817f69 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_10B.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_10B +#SBATCH --nodes=4 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_10B.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +export CUDA_VISIBLE_DEVICES='1,2,3,4' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "cpu_offload": true, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_${ZERO_STAGE} + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 4 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 1000000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_10B/randeng_t5_char_10B \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e86e8b077019a57c5a6ac28ab29749f1a2787aa --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_57M.sh @@ -0,0 +1,128 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_57M +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=64 
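+# Editor's note: with the TRAINER_ARGS below (--gpus 8, --num_nodes 1) and no
+# --accumulate_grad_batches, the effective global batch size is
+#   64 (micro batch) x 8 GPUs x 1 node = 512 sequences per optimizer step.
+# The generated ds_config reuses this same MICRO_BATCH_SIZE for
+# train_micro_batch_size_per_gpu, so the PyTorch Lightning and DeepSpeed settings
+# stay consistent by construction.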
+ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_57M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# export CUDA_VISIBLE_DEVICES='4,5' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 240000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_57M/randeng_t5_char_57M \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +/home/ganruyi/anaconda3/bin/python $CMD +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh new file mode 100644 index 0000000000000000000000000000000000000000..5b3b2c6c87831ebce78d4f7e0ed133b7a8468ba2 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_char_700M.sh @@ -0,0 +1,129 @@ +#!/bin/bash +#SBATCH --job-name=pretrain_randeng_t5_char_700M +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o 
/cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_char_700M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] +# export CUDA_VISIBLE_DEVICES='2,5' + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 400000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 2 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 100000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.1 \ + --dataset_num_workers 4 \ + --dataloader_num_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_bert_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data_path ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_700M/randeng_t5_char_700M \ + --tokenizer_type bert_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# /home/ganruyi/anaconda3/bin/python $CMD +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + diff --git a/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh new file mode 100644 index 0000000000000000000000000000000000000000..a91d7082a4c945fe78a2fb0ce99be7c7d9a02745 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_randeng_t5_large.sh @@ -0,0 +1,132 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_large +#SBATCH 
--nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/randeng_t5_large_v2/ +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_large_pretrain.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 100000, + "warmup_num_steps" : 10000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 8 \ + --num_nodes 2 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --every_n_train_steps 1000000 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --val_check_interval 0.01 \ + --preprocessing_num_workers 20 \ +" +# --accumulate_grad_batches 8 \ +DATA_DIR=wudao_180g_t5_tokenized_512 + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data ${DATA_DIR} \ + --train_split_size 0.999 \ + --max_seq_length 512 \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/hf_models/google/mt5-large \ + --new_vocab_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model \ + --keep_tokens_path /cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn_keep_tokens.json \ +" +# --ckpt_path /cognitive_comp/ganruyi/experiments/randeng_t5_large/ckpt/last.ckpt \ + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/pretrain_t5.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# source activate base +# python $CMD +# srun --nodes=1 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 --jobid=171866 -e %x-%j.err -o %x-%j.log python $CMD + +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +srun --jobid=172781 --job-name=randeng_t5_large --nodes=2 --gres=gpu:8 --ntasks-per-node=8 --cpus-per-task=30 -e randeng_t5_large-%j.err -o randeng_t5_large-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# salloc --nodes=1 --gres=gpu:2 
--cpus-per-gpu=20 -t 24:00:00 +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +# clear; srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python -u -m debugpy --listen 192.168.190.2:53005 --wait-for-client $CMD' \ No newline at end of file diff --git a/fengshen/examples/pretrain_t5/pretrain_t5.py b/fengshen/examples/pretrain_t5/pretrain_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..7a95bc8781ca5f4e0fa3ef0cb1eea98e5d4abbe6 --- /dev/null +++ b/fengshen/examples/pretrain_t5/pretrain_t5.py @@ -0,0 +1,175 @@ +import time +from builtins import print +import sys +import os +import torch +import argparse +import json +import pytorch_lightning as pl +from transformers import MT5Config, MT5Tokenizer +from pytorch_lightning import Trainer, loggers +from transformers import MT5ForConditionalGeneration +from pytorch_lightning.callbacks import LearningRateMonitor +# os.environ["CUDA_VISIBLE_DEVICES"] = '3' + + +class MT5PretrainModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--keep_tokens_path', default=None, type=str) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + if args.tokenizer_type == 't5_tokenizer': + if args.new_vocab_path is not None: + # 用于从mt5继续训练,此时只保留中英文词表,spm采用新模型 + assert args.keep_tokens_path is not None + keep_tokens = json.load(open(args.keep_tokens_path)) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path) + new_config = self.model.config + new_config.vocab_size = len(keep_tokens) + print('vocab_size:', new_config.vocab_size) + + new_state_dict = self.model.state_dict() + select_index = torch.tensor(keep_tokens) + new_state_dict['encoder.embed_tokens.weight'] = torch.index_select( + new_state_dict['encoder.embed_tokens.weight'], dim=0, index=select_index) + new_state_dict['shared.weight'] = torch.index_select( + new_state_dict['shared.weight'], dim=0, index=select_index) + new_state_dict['decoder.embed_tokens.weight'] = torch.index_select( + new_state_dict['decoder.embed_tokens.weight'], dim=0, index=select_index) + new_state_dict['lm_head.weight'] = torch.index_select( + new_state_dict['lm_head.weight'], dim=0, index=select_index) + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path, config=new_config, state_dict=new_state_dict) + # self.model = MT5ForConditionalGeneration(config=new_config) + else: + # 用于继续训练 + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + else: + self.model = MT5ForConditionalGeneration( + MT5Config.from_pretrained(args.pretrained_model_path) + ) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def 
configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + self.log('train_acc', acc, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + # print('is out of index: ', batch['input_ids'][batch['input_ids'] >= 32598]) + output = self.model( + input_ids=batch['input_ids'], labels=batch['labels']) + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/y_true.shape[0] + return acc + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if self.trainer.global_rank == 0 and self.trainer.global_step % self.hparams.every_n_train_steps == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(self.trainer.current_epoch, self.trainer.global_step))) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +def main(): + total_parser = argparse.ArgumentParser("Pretrain Unsupervise.") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument( + '--new_vocab_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=1024, type=int) + total_parser.add_argument('--ckpt_path', default=None, type=str) + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import UnsuperviseT5DataModel + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + # * Args for data preprocessing + total_parser = UnsuperviseT5DataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = MT5PretrainModel.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + print('Argument parse success.') + print('UnsuperviseT5DataModel load start {}'.format(get_time_str())) + data_model = UnsuperviseT5DataModel(args) + print('UnsuperviseT5DataModel load end {}'.format(get_time_str())) + if not args.do_eval_only: + model = MT5PretrainModel(args) + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'logs/')) + trainer = Trainer.from_argparse_args(args, + logger=logger, + 
callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model, ckpt_path=args.ckpt_path) + else: + tokenizer = MT5Tokenizer.from_pretrained(args.new_vocab_path, extra_ids=0) + model = MT5PretrainModel(args=args, num_data=len(data_model.predict_dataloader())) + trainer = Trainer.from_argparse_args(args) + + result = trainer.predict(model, data_model) + result = result[0] + for i in range(4): + print(tokenizer.batch_decode(result['input_ids'][i])) + print(tokenizer.batch_decode(result['predict_ids'][i])) + print(tokenizer.batch_decode(result['labels'][i])) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/pretrain_t5/process_data.py b/fengshen/examples/pretrain_t5/process_data.py new file mode 100644 index 0000000000000000000000000000000000000000..bae164f107f7ec3569227f3e40a292ee1641fd21 --- /dev/null +++ b/fengshen/examples/pretrain_t5/process_data.py @@ -0,0 +1,65 @@ +# coding=utf8 +import argparse +import sys +import os +from concurrent.futures import ProcessPoolExecutor + + +def _generate_cache_arrow(index, ds, path): + print('saving dataset shard {}'.format(index)) + ds.save_to_disk(os.path.join(path, 'part_{}'.format(index))) + return 'saving dataset shard {} done'.format(index) + + +def generate_arrow_cache(ds, args) -> None: + ''' + 读取wudao_180g等原数据或者tokenized之后的数据,并进行train test split + 同时利用seed 42做shuffle 缓存下来 + ''' + ds = ds.train_test_split(train_size=args.train_split_size, seed=42) + print(ds) + p = ProcessPoolExecutor(max_workers=args.preprocessing_num_workers) + res = [] + train_shard_part = args.saved_data_shards + for i in range(0, train_shard_part): + res.append(p.submit(_generate_cache_arrow, i, + ds['train'].shard(train_shard_part, i), args.saved_train_data_path)) + + p.shutdown(wait=True) + for future in res: + print(future.result(), flush=True) + + ds['test'].save_to_disk(args.saved_test_data_path) + print('done') + + +if __name__ == '__main__': + total_parser = argparse.ArgumentParser("Save data Task") + total_parser.add_argument( + '--new_vocab_path', default='/cognitive_comp/ganruyi/hf_models/t5_cn_small/sentencepiece_cn.model', type=str) + total_parser.add_argument('--preprocessing_num_workers', default=30, type=int) + total_parser.add_argument( + '--train_data_path', default='/cognitive_comp/common_data/test_wudao_180g_mt5_tokenized/', type=str) + total_parser.add_argument('--saved_data_shards', default=800, type=int) + total_parser.add_argument('--saved_train_data_path', default=None, type=str) + total_parser.add_argument('--saved_test_data_path', default=None, type=str) + total_parser.add_argument('--max_seq_length', default=512, type=int) + total_parser.add_argument('--train_split_size', default=0.999, type=float) + total_parser.add_argument('--pretrained_model_path', default=None, type=str) + total_parser.add_argument('--tokenizer_type', default='t5_tokenizer', choices=['t5_tokenizer', 'bert_tokenizer']) + total_parser.add_argument('--text_column_name', default='text') + total_parser.add_argument('--remove_columns', nargs='+', default=[]) + + # * Args for data preprocessing + args = total_parser.parse_args() + sys.path.append('../../../') + from fengshen.data.t5_dataloader.t5_datasets import UnsuperviseT5Dataset + ds = UnsuperviseT5Dataset(args.train_data_path, args) + print(ds) + generate_arrow_cache(ds.data, args=args) + # ds = UnsuperviseT5Dataset(args.train_data_path, args, load_data_type=0) + for i in range(0, 2): + print(ds.data[i]) + print(ds.tokenizer.decode(ds.data[i]['input_ids'])) + + print(ds.data) diff --git 
a/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh b/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh new file mode 100644 index 0000000000000000000000000000000000000000..b17187c6a26c0a5edf46cf2d9c5736338e6ff934 --- /dev/null +++ b/fengshen/examples/pretrain_t5/process_data_bert_tokenizer.sh @@ -0,0 +1,36 @@ +#!/bin/bash +#SBATCH --job-name=process_data_bert_tokenizer +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # number of gpus +#SBATCH --cpus-per-task=120 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/%x-%j.log +#SBATCH -e /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/%x-%j.err +set -x -e + +echo "START TIME: $(date)" + +DATA_ARGS=" + --tokenizer_type bert_tokenizer \ + --train_data_path wudao_180g \ + --train_split_size 0.999 \ + --max_seq_length 512 \ + --preprocessing_num_workers 100 \ + --saved_data_shards 800 \ + --saved_train_data_path /cognitive_comp/common_data/wudao_180g_bert_tokenized_512_train/ \ + --saved_test_data_path /cognitive_comp/common_data/wudao_180g_bert_tokenized_512_test/ \ + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_char_77M/randeng_t5_char_77M \ + --text_column_name text \ + --remove_columns token_type_ids text \ +" + + # --remove_columns text \ +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/pretrain_t5/process_data.py + +export CMD=" \ + $SCRIPTS_PATH \ + $DATA_ARGS \ + " + +echo $CMD +source activate base +/home/ganruyi/anaconda3/bin/python $CMD \ No newline at end of file diff --git a/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py b/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..530b74122b46f33bfa3de5cf536963f3538a9d40 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/flickr_datasets.py @@ -0,0 +1,35 @@ +# 这里这个dataset只是临时测试用的,所以暂时用最简陋的方式放在这里,后续会优化 +from torch.utils.data import Dataset +from PIL import Image + + +class flickr30k_CNA(Dataset): + def __init__(self, img_root_path=None, + text_annot_path=None, + data_process_fn=None): + self.images = [] + self.captions = [] + self.labels = [] + self.root = img_root_path + with open(text_annot_path, 'r') as f: + for line in f: + line = line.strip().split('\t') + key, caption = line[0].split('#')[0], line[1] + img_path = key + '.jpg' + self.images.append(img_path) + self.captions.append(caption) + self.labels.append(key) + self.data_process_fn = data_process_fn + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = str(self.root + "/" + self.images[idx]) + instance_image = Image.open(img_path) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + captions = self.captions[idx] + label = self.labels[idx] + image, text = self.data_process_fn(instance_image, captions) + return image, text, label diff --git a/fengshen/examples/pretrain_taiyi_clip/pretrain.py b/fengshen/examples/pretrain_taiyi_clip/pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..56e24ac370ff2b5f3ecf84a32586bc5205499b07 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/pretrain.py @@ -0,0 +1,308 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.clip import ( + TaiyiCLIPModel, + TaiyiCLIPProcessor, +) +from fengshen.models.model_utils import ( + add_module_args, + 
configure_optimizers, + get_total_steps, +) +import torch +import torch.nn.functional as F +import argparse +import math +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import os +import numpy as np +from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor + +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + + +class Collator(): + def __init__(self, args, processor): + self.processor = processor + self.seq_length = args.seq_length + self.transforms = Compose([ + ToTensor(), + RandomResizedCrop(args.resolution, scale=(0.9, 1.0), + interpolation=InterpolationMode.BICUBIC), + Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD), + ]) + + def __call__(self, inputs): + max_length = min(self.seq_length, max([len(i['caption']) for i in inputs])) + images = [] + texts = [] + labels = [] + for i in inputs: + # instance_image = Image.open(i['img_path']) + # instance_image = jpeg4py.JPEG(i['img_path']).decode() + instance_image = np.load(i['npy_path']) + images.append(self.transforms(instance_image)) + texts.append(i['caption']) + labels.append(i['labels'] if 'labels' in i else -100) + # images_input = self.processor(images=images, return_tensors="pt") + texts_input = self.processor(text=texts, + max_length=max_length, + padding='max_length', + truncation=True, + return_tensors='pt') + # return images_input, texts_input, labels + return {'pixel_values': torch.stack(images)}, texts_input, labels + + +class TaiyiCLIP(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi CLIP') + parser.add_argument('--loss_type', choices=['local', 'global'], default='local') + parser.add_argument('--seq_length', default=77) + parser.add_argument('--gather_with_grad', default=False, action='store_true') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + self.model = TaiyiCLIPModel.from_pretrained(args.model_path) + self.processor = TaiyiCLIPProcessor.from_pretrained(args.model_path) + + self.local_loss = args.loss_type == 'local' + + if args.freeze_image_tower: + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.visual_projection.requires_grad = False + + # cache + self.cache_labels = True + self.prev_num_logits = 0 + self.labels = {} + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def forward(self, image, text): + assert image is not None + assert text is not None + image_features = self.model.get_image_features(**image) + text_features = self.model.get_text_features(**text) + + image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) + text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) + + return image_features, text_features, self.model.logit_scale.exp() + + def gather_features(self, features): + if self.trainer.world_size == 1: + 
return features + all_features = self.all_gather( + features, sync_grads=self.hparams.gather_with_grad) + if not self.local_loss and not self.gather_with_grad: + # 如果是全局loss,并且不需要梯度,需要把梯度更新回tensor + all_features[self.global_rank] = features + all_features = all_features.view(-1, all_features.shape[-1]) + return all_features + + def clip_loss(self, image_features, text_features, logit_scale): + + logits_per_image = None + + # 如果我冻住VIT并且是local_loss,那么我只需要自己的这部分text feature就行 + # 因为根本不需要image2text的feature训练VIT + if self.hparams.freeze_image_tower and self.local_loss: + all_text_features = None + else: + all_text_features = self.gather_features( + text_features) + all_image_features = self.gather_features( + image_features) + + if self.local_loss: + if all_text_features is not None: + logits_per_image = logit_scale * image_features @ all_text_features.T + logits_per_text = logit_scale * text_features @ all_image_features.T + else: + # 如果是global_loss,那all_text_features肯定不是空的 + logits_per_image = logit_scale * all_image_features @ all_text_features.T + logits_per_text = logits_per_image.T + + num_logits = logits_per_text.shape[0] + if self.prev_num_logits != num_logits or self.device not in self.labels: + labels = torch.arange(num_logits, device=self.device, dtype=torch.long) + if self.trainer.world_size > 1 and self.local_loss: + labels = labels + num_logits * self.global_rank + if self.cache_labels: + self.labels[self.device] = labels + self.prev_num_logits = num_logits + else: + labels = self.labels[self.device] + + total_loss = ( + F.cross_entropy(logits_per_image, labels) + + F.cross_entropy(logits_per_text, labels) + ) / 2 if logits_per_image is not None else F.cross_entropy(logits_per_text, labels) + return total_loss + + def training_step(self, batch): + image, text, _ = batch + image_features, text_features, logit_scale = self(image, text) + total_loss = self.clip_loss(image_features, text_features, logit_scale) + self.log('train_loss', total_loss, sync_dist=False) + return total_loss + + def on_train_batch_end(self, outputs, batch, batch_idx: int) -> None: + with torch.no_grad(): + self.model.logit_scale.clamp_(0, math.log(100)) + + def get_metrics(self, image_features, text_features, labels, logit_scale): + # 计算相似度,支持多个样本的情况(比如一个图片有多个caption) + # img2txt计算的时候要用到,因为一张图片可能对应多个文本。 + # txt2img计算的时候不需要(一般一个text只有一个对应图片) + metrics = {} + logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu() + logits_per_text = logits_per_image.t().detach().cpu() + + logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text} + + label2idx = {} # 计算label到idx的映射。 + repeat_id = [] + for i, label in enumerate(labels): + if label not in label2idx: + label2idx[label] = [i] + else: + # 表示该index的标签出现过,记录这个index,后续算txt2img分数的时候,这些index的权值要降低。 + label2idx[label].append(i) + repeat_id.append(i) + + ground_truth = [label2idx[label] for label in labels] + + for name, logit in logits.items(): + if name == 'text_to_image': + logit[:, repeat_id] -= 1e8 # 这部分的分数要降低。(重复出现的图片,直接忽略) + r_stat = {1: [], 5: [], 10: []} + # r1_stat, r5_stat, r10_stat = [], [], [] + # index of the largest element to the smallest + ranking = torch.argsort(logit, descending=True) + for i, each_query in enumerate(ranking[:, :10]): + for j, q in enumerate(each_query): + found = False + if q in ground_truth[i]: + for k, v in r_stat.items(): + if j < k: + found = True + v.append(1) + if found: + break + for k, v in r_stat.items(): + metrics[f'{name}_R@{k}'] = sum(v)/len(logit) + return metrics + + def 
validation_step(self, batch, batch_idx): + image, text, label = batch + image_features, text_features, logit_scale = self(image, text) + return image_features, text_features, logit_scale, text['input_ids'].shape[0], label + + def validation_epoch_end(self, val_outputs): + all_image_features = [] + all_text_features = [] + all_labels = [] + sample_size = 0 + for o in val_outputs: + all_image_features.append(o[0]) + all_text_features.append(o[1]) + sample_size += o[3] + all_labels += o[4] + if len(all_image_features) == 0 or len(all_text_features) == 0: + return + all_image_features = torch.cat(all_image_features) + all_text_features = torch.cat(all_text_features) + logit_scale = val_outputs[0][2].mean() + logits_per_image = logit_scale * all_image_features @ all_text_features.t() + logits_per_text = logits_per_image.t() + + labels = torch.arange(sample_size, device=self.device).long() + total_loss = (F.cross_entropy(logits_per_image, labels) + + F.cross_entropy(logits_per_text, labels)) / 2 + + val_metrics = self.get_metrics( + image_features=all_image_features, + text_features=all_text_features, + logit_scale=logit_scale, + labels=all_labels) + loss = total_loss / sample_size + self.log('val_loss', loss, sync_dist=False) + for k, v in val_metrics.items(): + self.log(f'val_{k}', v, sync_dist=False) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # 保存的时候把权重按huggingface的形式保存出来 + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + self.processor.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = TaiyiCLIP.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = TaiyiCLIP(args) + processor = model.processor + collate_fn = Collator(args, processor) + datasets = load_data(args, global_rank=trainer.global_rank) + + # 加载单个验证集:!!!验证代码有效性临时这样干的,验证完有效性会删除 + from fengshen.examples.pretrain_taiyi_clip.flickr_datasets import flickr30k_CNA + img_root = '/shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images' + text_annot_path = '/shared_space/ccnl/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt' + + datasets[args.val_datasets_field] = flickr30k_CNA(img_root, text_annot_path, collate_fn) + + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_taiyi_clip/test.py b/fengshen/examples/pretrain_taiyi_clip/test.py new file mode 100644 index 
0000000000000000000000000000000000000000..c5927a8688618678c8838162bf0c42fac6067e19 --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/test.py @@ -0,0 +1,36 @@ +from pytorch_lightning import ( + Trainer, +) +from fengshen.models.model_utils import ( + add_module_args, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.examples.pretrain_taiyi_clip.pretrain import ( + TaiyiCLIP, + Collator, +) +from fengshen.data.fs_datasets import load_dataset +from torch.utils.data import DataLoader + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = TaiyiCLIP.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, callbacks=[ + checkpoint_callback + ]) + + model = TaiyiCLIP(args) + processor = model.processor + collate_fn = Collator(processor) + datasets = load_dataset(args.datasets_name) + dataloader = DataLoader(datasets[args.test_datasets_field], + batch_size=args.test_batchsize, num_workers=2, collate_fn=collate_fn) + trainer.validate(model, dataloaders=dataloader, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/pretrain_taiyi_clip/test.sh b/fengshen/examples/pretrain_taiyi_clip/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..729fa870407ec42b5cd48872c6acb9f5a4c8bf4f --- /dev/null +++ b/fengshen/examples/pretrain_taiyi_clip/test.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH --job-name=finetune_taiyi # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=8 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:8 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=64 + +DATA_ARGS="\ + --test_batchsize $MICRO_BATCH_SIZE \ + --datasets_name flickr30k-CNA \ + " + +MODEL_ARGS="\ + --model_path /cognitive_comp/gaoxinyu/github/Fengshenbang-LM/fengshen/workspace/taiyi-clip-huge-v2/hf_out_0_661 \ + " + +TRAINER_ARGS="\ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy ddp \ + --log_every_n_steps 0 \ + --default_root_dir . \ + --precision 32 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $TRAINER_ARGS \ + " + +CUDA_VISIBLE_DEVICES=0 python3 test.py $options +#srun -N $NNODES --gres=gpu:$GPUS_PER_NODE --ntasks-per-node=$GPUS_PER_NODE --cpus-per-task=20 python3 pretrain.py $options diff --git a/fengshen/examples/qa_t5/README.md b/fengshen/examples/qa_t5/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fffd0ac176970683240127ce9f7b29c0f0e0ea97 --- /dev/null +++ b/fengshen/examples/qa_t5/README.md @@ -0,0 +1,98 @@ +# 燃灯系列-T5问答模型微调 +## 简介 Brief Introduction + Here are codes for finetuning Randeng-T5-QA-Chinese. The model was pretrained on the Wudao 180G corpus, and finetuned on Chinese SQuAD and CMRC2018 dataset. 
It can produce a fluent and accurate answer given a passage and question.
+
+这是中文的生成式问答模型[Randeng-T5-QA-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-T5-784M-QA-Chinese)的微调代码。它基于T5-Large结构,使用悟道180G语料在[封神框架](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen)进行预训练,在ChineseSQuAD和CMRC2018两个阅读理解数据集上进行微调。输入一篇文章和一个问题,可以生成准确流畅的回答。
+
+## 模型类别 Model Taxonomy
+
+| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| 通用 General | 自然语言转换 NLT | 燃灯 Randeng | T5 | 784M | 中文生成式问答 Chinese Generative Question Answering |
+
+模型架构
+
+| 配置 | 参数 |
+| ---- | ---- |
+| encoder layers | 12 |
+| encoder_attention_heads | 16 |
+| encoder_ffn_dim | 2816 |
+| decoder layers | 24 |
+| decoder_attention_heads | 16 |
+| decoder_ffn_dim | 2816 |
+| max_encode_length | 1024 |
+
+## 模型表现 Performance
+
+CMRC 2018的测试集上的效果(原始任务是一个起始和结束预测问题,这里作为一个生成回答的问题):
+
+| model | Contain Answer Rate | RougeL | BLEU-4 | F1 | EM |
+| ----- | ------------------- | ------ | ------ | ---- | ---- |
+| Ours | 76.0 | 82.7 | 61.1 | 77.9 | 57.1 |
+
+Our model enjoys a high level of generation quality and accuracy: 76% of the generated answers contain the ground truth. The high RougeL and BLEU-4 scores reflect the large overlap between the generated results and the ground truth. Our model has a lower EM because it generates complete sentences while the golden answers are usually sentence fragments.
+
+我们的模型有着极高的生成质量和准确率,76%的回答包含了正确答案(Contain Answer Rate)。RougeL和BLEU-4反映了模型预测结果和标准答案重合的程度。我们的模型EM值较低,因为生成的大部分为完整的句子,而标准答案通常是句子片段。
+
+## 模型
+
+T5-Large: [Randeng-T5-784M-QA-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-T5-784M-QA-Chinese)
+
+文件:
+ - qa_dataset.py 数据集的处理,包含dataset和dataloader
+ - finetune_t5_cmrc.py 模型微调核心代码
+ - run_finetune.sh 微调脚本(未安装deepspeed的话strategy参数改为ddp)
+ - run_predict.sh 预测脚本
+
+## 使用 Usage
+
+下面是一个推理的最小示例。长度参数与run_finetune.sh中的设置保持一致,输入格式(以`answer:<extra_id_0></s>`结尾)与qa_dataset.py的预处理保持一致:
+
+```python
+import torch
+from transformers import T5Tokenizer, MT5ForConditionalGeneration
+
+pretrain_path = 'IDEA-CCNL/Randeng-T5-784M-QA-Chinese'
+tokenizer = T5Tokenizer.from_pretrained(pretrain_path)
+model = MT5ForConditionalGeneration.from_pretrained(pretrain_path)
+
+# 长度设置与run_finetune.sh一致
+max_seq_length = 512
+max_knowledge_length = 425
+max_target_length = 128
+
+sample = {"context": "在柏林,胡格诺派教徒创建了两个新的社区:多罗西恩斯塔特和弗里德里希斯塔特。到1700年,这个城市五分之一的人口讲法语。柏林胡格诺派在他们的教堂服务中保留了将近一个世纪的法语。他们最终决定改用德语,以抗议1806-1807年拿破仑占领普鲁士。他们的许多后代都有显赫的地位。成立了几个教会,如弗雷德里夏(丹麦)、柏林、斯德哥尔摩、汉堡、法兰克福、赫尔辛基和埃姆登的教会。", "question": "除了多罗西恩斯塔特,柏林还有哪个新的社区?", "idx": 1}
+plain_text = 'question:' + sample['question'] + 'knowledge:' + sample['context'][:max_knowledge_length]
+
+# 输入以 "answer:<extra_id_0></s>" 结尾,与qa_dataset.py中regular_tokenize的处理一致
+res_prefix = tokenizer.encode('answer:', add_special_tokens=False)
+res_prefix.append(tokenizer.convert_tokens_to_ids('<extra_id_0>'))
+res_prefix.append(tokenizer.eos_token_id)
+l_rp = len(res_prefix)
+
+tokenized = tokenizer.encode(plain_text, add_special_tokens=False,
+                             truncation=True, max_length=max_seq_length - 2 - l_rp)
+tokenized += res_prefix
+input_ids = torch.tensor([tokenized], dtype=torch.long)
+
+# Generate answer
+pred_ids = model.generate(input_ids=input_ids, max_new_tokens=max_target_length,
+                          do_sample=True, top_p=0.9)
+print(tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+```
+
+## 引用 Citation
+如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2210.08590):
+
+If you are using the resource for your work, please cite our [paper](https://arxiv.org/abs/2210.08590):
+
+```text
+@article{fengshenbang,
+  author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan
and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +欢迎引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` \ No newline at end of file diff --git a/fengshen/examples/qa_t5/example_data.json b/fengshen/examples/qa_t5/example_data.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9c12d9f4eb60a10cb316504d8752097b3eacd7 --- /dev/null +++ b/fengshen/examples/qa_t5/example_data.json @@ -0,0 +1,5 @@ +{"context": "郑思肖,原名郑之因,祖籍连江(今福建福州连江县),字忆翁,号所南,又号三外野人,宋末元初画家、诗人。南宋亡国后,孤身隐居苏州,终身未娶。郑思肖生于南宋理宗淳祐元年(1241年)。祖父郑咸曾任枝江县主簿。父亲郑起,字叔起,号菊山,从事教书生涯。母亲楼氏。南宋末年,通过科举,考中秀才,为理宗时的太学上舍,应博学宏词科。咸淳三年(1267年) 元人南下攻宋,咸淳九年(1273年)襄阳失守。郑思肖献策抵抗,因“辞切直,忤当路”,未被采纳。祥兴二年(1279年)南宋灭亡,郑思肖隐居吴中(今江苏苏州),改名思肖(因宋朝皇帝姓赵),改字忆翁;为了寄托爱国情怀,郑思肖坐卧必向南,并自号“所南”;所居之处命名为“本穴世界”(把“本”字中的“十”置于“穴”字中,便是“大宋”,以词寓意其乃大宋遗民,不忘故国。元仁宗延祐五年(1318年),郑思肖于苏州觉报寺内逝世。郑思肖专工画兰,特征为花和叶萧疏,画兰不画土地和根,寓意宋朝沦亡。其存世作品有《国香图卷》、《墨兰图卷》、《墨兰图》等。其中《墨兰图卷》藏于日本大阪市立美术馆,《墨兰图》藏于美国耶鲁大学美术馆。郑思肖在《画菊》中托物言志,以菊花自比,隐含了诗人的人生遭际和理想追求。这首诗的意思是:菊花不与百花为丛,独立却意趣未穷,宁愿在枝头上枯死、遗留芬芳,也不向元朝(北风)投降。常用于表达高尚的民族气节。2017年6月中国国民党主席洪秀柱在海峡论坛的讲话中引用了这首诗的后两句,说“现实轨迹总未必是尽如人意,但逆境才能锤炼出钢铁的意志”。郑思肖的作品有《郑所南先生文集》、《一百二十图诗集》、《心史》等。其中《心史》是郑思肖在南宋灭亡之后写下的一部诗文总集,分上下两卷。史学家陈寅恪很推崇郑思肖,在《柳如是别传》中曾写“所南心史,固非吴井之藏;孙盛阳秋,同是辽东旧本。”郭沫若在抗日战争期间写的《国画中的民族意识》中,称赞郑思肖是“民族意识浓烈的人”。", "answer": ["为了寄托爱国情怀"], "question": "祥兴二年(1279年)以后,郑思肖为何坐卧必向南,并自号“所南”", "idx": 10045, "ans_span": [[267, 275]]} +{"context": "网络信标(web beacon)也称网页臭虫(web bug),是可以暗藏在任何网页元素或邮件内的1像素大小的透明GIF或PNG图片,常用来收集目标电脑用户的上网习惯等数据,并将这些数据写入Cookie。网络信标和垃圾邮件中较为常用。网络臭虫(Web bug)也称为网络信标(Web beacon),是一个放置在网页或电子邮件上的文件对象,用于监测用户的行为。它不像Cookie那样可以被浏览器用户接受或拒绝,网络臭虫只以图形交换格式(GIF)或其他文件对象的形式出现。它通常只能被检测,如果用户查看网页的源版本会发现一个从不同的Web服务器而不是从网页的其他部分负载的标签。虽然互联网隐私倡导者反对使用网络臭虫,但是他们大部分承认网络臭虫有积极用途,例如跟踪侵犯版权的网站。根据Richard M.Smith,网络臭虫(Web bug)可以收集以下资料:网络臭虫(Web bug)经常被垃圾邮件发送者用来验证电子邮件地址。当收件人打开一封有网络臭虫的电子邮件时,返回给发件人的信息就会显示邮件已被打开,这样就可以确认电子邮件地址是有效的。信标API(Beacon API)是一种较新的Web技术,它不需要使用不可见图像或类似手段就能达到相同的目的。,它还是一个万维网联盟的候选建议。其旨在使Web开发人员能在用户离开页面时将信息(如分析或诊断数据)发回Web服务器,以跟踪用户的活动。使用Web信标API能够不干扰或影响网站导航的完成此种跟踪,并且对最终用户不可见。信标API已于2014年被相继引入到Mozilla Firefox和Google Chrome网页浏览器。", "answer": ["图形交换格式(GIF)或其他文件对象的形式出现"], "question": "网络臭虫以什么样的形式出现在网页中?", "idx": 9359, "ans_span": [[211, 234]]} +{"context": "10号球,是一种新兴的花式撞球运动项目,其基本玩法与9号球类似,但多了一颗10号子球,而且击球前必须先指定球、指定袋,所以困难度提高很多,颇具发展潜力。参赛双方比球决定第一局谁先开球,此后各局采轮流开球制。排球时10颗子球紧密排成三角形,1号球在前端,并位于脚点上,10号球在三角形中间,其他各球位置不限。开球前应将母球放置于发球线后,并以球杆撞击母球使其先碰到1号球。开球时若无子球进袋,至少应有4颗子球触碰台边,否则即为犯规。开完球的第一杆可以做push out,要使用必须事先声明。所谓push out是指可以把母球推到任何一个位置,不受先碰到号码最小的球这条规则限制,你想把球打进也可以(但是10号打进要捡起来放回脚点)。若母球落袋一样算犯规,此时对手可以选择打或是不打。若开完球之后,无法打到目标球,选手通常会作push out,然后双方进入防守战。击球前必须先表明要将那一球打进那一袋,即所谓「指定球、指定袋」。若是要进的球很明确,则可省略指定球;但无论何种状况,均不可省略指定袋。至于子球碰撞颗星或与其他球的碰撞方式,均不必说明。每次击球时,母球必须先碰撞台面上号码最小的球,才算合法击球。当击球者将正确的球打进正确的袋后,始得以继续击球;否则换对手击球。若是指定球进错袋,或者进错球,也要换对手击球;但此时对手可以选择打或不打。比赛中先将10号球打进袋者赢得该局击球者若发生以下情形,将换由对手发自由球,亦即可将母球放置于台面上的任何位置再行击球。", "answer": ["击球前必须先表明要将那一球打进那一袋"], "question": "指定球、指定袋又是指什么?", "idx": 7246, "ans_span": [[380, 398]]} +{"context": 
"分析机是由英国数学家查尔斯·巴贝奇设计的一种机械式通用计算机。从1837年首次提出这种机器的设计,一直到他去世的1871年,由于种种原因,这种机器并没有被真正的制造出来。但它本身的设计逻辑却十分先进,是大约100年后电子通用计算机的先驱。查尔斯·巴贝奇最初尝试的所谓差分机,可以通过求解差分来计算对数表和三角函数表,然后能近似计算多项式。由于巴贝奇与他的首席工程师起了争执,英国政府就撤回了这项项目的资金,差分机也因此没能完成。在这期间,巴贝奇意识到建造一种更加通用的机器(即所谓的分析机)是可行的,于是便于1833年开始了分析机的设计。分析机由蒸汽机驱动,大约有30米长、10米宽。它的输入由程序和数据组成,并使用打孔卡输入,这种输入方法被当时的织布机广泛采用。分析机通过一台打印机、一个弯曲的绘图仪和一个铃铛输出,也可以在纸上打孔以便日后读取。分析机采取普通的十进制定点计数法。它的“记忆体”大约可以存储1000个40位的十进制数(每个数约16.2kB)。有一个算术逻辑单元可以进行四则运算、比较和求平方根操作。刚开始研制的时候,分析机的外观被普遍认为和差分机相似 。1858年的图纸呈现了一个有规律的网格布局。与现代计算机的中央处理器(CPU)类似,其算术逻辑单元使用的微程序存储在插在被称为“桶”的滚筒上的支柱中,这为用户指定更加复杂的运算提供了便利。分析机使用的编程语言与今天的汇编语言类似,支持循环语句和条件分支,因此这门语言被认为是图灵完备的。分析机采用三种不同的打孔卡和读卡器来区分算术运算、数字常量和存储的指令,以此实现了数字在存储器和运算单元之间的加载和存储操作。巴比奇在1837至1840年间写下了24份程序,并在之后又写了一份。这些程序可以计算多项式、迭代公式、高斯消去法和伯努利数。", "answer": ["1837年"], "question": "分析机设计首次提出是什么时候?", "idx": 3905, "ans_span": [[32, 37]]} +{"context": "金文秀(Kim Moon-Soo,),已退休韩国男子羽毛球运动员。金文秀曾经两度赢得世界锦标赛男子双打冠军。他也曾赢得1面奥运会男子双打金牌,及3次全英公开赛男子双打冠军。这些荣誉均是与同胞朴柱奉共同获得,那也是他羽毛球生涯中的主要搭档。2002年,他被选入羽毛球名人堂。金文秀曾于1992年夏季奥林匹克运动会羽毛球比赛代表韩国出赛。他与朴柱奉搭档参加男子双打项目,在决赛中以15-11, 15-7击败来自印尼的洪忠中、郭宏源组合而夺得金牌。在伦敦奥运会的女双赛事发生「消极比赛事件」,事后,韩国羽协重罚涉事的教练和球员。身为女双教练的金文秀被韩国羽协剥夺教练资格,此后不能在韩国国内俱乐部队执教。", "answer": ["朴柱奉"], "question": "金文秀1992年和谁一起参加了男子双打项目?", "idx": 8479, "ans_span": [[95, 98]]} \ No newline at end of file diff --git a/fengshen/examples/qa_t5/finetune_t5_cmrc.py b/fengshen/examples/qa_t5/finetune_t5_cmrc.py new file mode 100644 index 0000000000000000000000000000000000000000..c8f2c30254f7b907921a83a07dba802279838ac9 --- /dev/null +++ b/fengshen/examples/qa_t5/finetune_t5_cmrc.py @@ -0,0 +1,450 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@File : finetune_t5_cmrc.py +@Time : 2022/10/28 19:57 +@Author : He Junqing +@Version : 1.0 +@Contact : hejunqing@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +import pytorch_lightning as pl +import os +import sys +import time +import torch +import argparse +from collections import Counter +from fengshen.utils.utils import chinese_char_tokenize +from fengshen.data.universal_datamodule import UniversalDataModule +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import MT5ForConditionalGeneration, T5Tokenizer, MT5Config +from torchmetrics.text.rouge import ROUGEScore +from nltk.translate.bleu_score import corpus_bleu + +torch.cuda.empty_cache() + + +class QAFinetuneModel(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group("BaseModel") + parser.add_argument("--prediction_res_path", default=None, type=str) + parser.add_argument( + "--decode_strategy", + default="greedy", + choices=["beamsearch", "sampling", "greedy"], + ) + return parent_args + + def __init__(self, args): + super().__init__() + self.save_hyperparameters(args) + self.formator = args.formator + self.max_target_length = args.max_target_length + self.decode_strategy = args.decode_strategy + self.rouge_metric = ROUGEScore( + rouge_keys=("rougeL", "rouge1", "rouge2"), normalizer=lambda x: x + ) + self.loss_func = torch.nn.CrossEntropyLoss(reduction="none") + + self.model = MT5ForConditionalGeneration.from_pretrained( + args.pretrained_model_path + ) + print("using MT5 model") + + if args.tokenizer_type == "t5_tokenizer": + self.tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path) + print("vocab_size:", len(self.tokenizer)) + # self.tokenizer.add_special_tokens(special_token_dict) + # print('add special tokens to tokenizer,vocab size:',len(self.tokenizer)) + else: + print("now only the t5_tokenizer is supported") + self.bleu_val = [] + + def setup(self, stage=None) -> None: + + if stage == "fit": + train_loader = ( + self.trainer._data_connector._train_dataloader_source.dataloader() + ) + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs + ) + self.total_steps = ( + len(train_loader.dataset) * self.trainer.max_epochs // tb_size + ) // ab_size + else: + self.total_steps = ( + self.trainer.max_steps // self.trainer.accumulate_grad_batches + ) + + print("Total steps: {}".format(self.total_steps)) + # return super().setup(stage) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + + return configure_optimizers(self) + + def on_save_checkpoint(self, checkpoint) -> None: + # Save the current loop info in the mid of epoch + # if you lightning <= 1.6.0 uncomment the line below + # checkpoint['loops'] = self.trainer.checkpoint_connector._get_loops_state_dict() + if ( + self.trainer.global_rank == 0 + and self.trainer.global_step % self.hparams.every_n_train_steps == 0 + ): + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + "hf_pretrained_epoch{}_step{}".format( + self.trainer.current_epoch, self.trainer.global_step + ), + ) + ) + + def on_load_checkpoint(self, checkpoint) -> None: + global_step_offset = checkpoint["global_step"] + if "global_samples" in 
checkpoint: + self.consumed_samples = checkpoint["global_samples"] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def training_step(self, batch, batch_idx): # todo: change + if self.formator == "t5style": + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + decoder_input_ids=batch["decoder_input_ids"], + ) + else: + output = self.model( + input_ids=batch["input_ids"], + input_token_type=batch["token_types"], + labels=batch["labels"], + decoder_input_ids=batch["decoder_input_ids"], + ) + # print(output.logits) + acc = self.comput_metrix(output.logits, batch["labels"]) + grad = get_gradient_norm(self.model) + self.log("train_loss", output.loss, sync_dist=True) + self.log("train_acc", acc, sync_dist=True) + self.log("train_grad", grad, sync_dist=True) + return output.loss + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + ) + pred_ids = self.model.generate( + input_ids=batch["input_ids"], max_new_tokens=self.max_target_length + ) + + acc = self.comput_metrix(output.logits, batch["labels"]) + # print(output.logits.shape) + self.log("val_loss", output.loss, sync_dist=True) + self.log("val_acc", acc, sync_dist=True) + batch_labels = torch.where( + batch["labels"] != -100, batch["labels"], self.tokenizer.pad_token_id + ) + + ppl = torch.exp(output.loss) + self.log("val_ppl", ppl, sync_dist=True) + pred_tokens = self.tokenizer.batch_decode( + pred_ids, cleanup_tokenization_space=True, skip_special_tokens=True + ) + label_tokens = self.tokenizer.batch_decode( + batch_labels, cleanup_tokenization_space=True, skip_special_tokens=True + ) + pred_sentences = list(map(remove_pad, pred_tokens)) + # print(label_tokens) + self.bleu_val.append(compute_bleu(pred_sentences, [[t] for t in label_tokens])) + candidate = [ + chinese_char_tokenize(p).lstrip("") for p in pred_tokens + ] + target = [ + generate_sentence(chinese_char_tokenize(sent)).lstrip("") + for sent in label_tokens + ] + self.rouge_metric.update(preds=candidate, target=target) + f1 = compute_f1(candidate, label_tokens) + self.log("val_f1", f1, sync_dist=True) + + def on_validation_epoch_end(self) -> None: + n = len(self.bleu_val) + avg_bleu = float(sum(self.bleu_val)) / n + print("bleu:", avg_bleu) + self.log("val_bleu", avg_bleu) + self.bleu_val = [] + rouge_dict = self.rouge_metric.compute() + # reset the metric after once validation + self.rouge_metric.reset() + for k, v in rouge_dict.items(): + self.log("val_{}".format(k), v, sync_dist=True) + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print("rouge:\n", rouge_dict) + return + + def predict_step(self, batch, batch_idx): + num_beams = 1 + do_sample = False + top_p = None + if self.decode_strategy == "beamsearch": + num_beams = 10 + elif self.decode_strategy == "sampling": + num_beams = 4 + top_p = 0.9 + do_sample = True + + prediction_dic = self.model.generate( + input_ids=batch["input_ids"], + max_new_tokens=self.max_target_length, + num_beams=num_beams, + do_sample=do_sample, + top_p=top_p, + no_repeat_ngram_size=3, + return_dict_in_generate=True, + output_scores=True, + ) + output = self.model( + input_ids=batch["input_ids"], + labels=batch["labels"], + ) + prediction_ids = prediction_dic["sequences"] + loss_tensor = self.loss_func(output.logits.transpose(1, 2), batch["labels"]) + indexes = torch.where(batch["labels"] == self.tokenizer.eos_token_id)[1] + loss = torch.sum(loss_tensor, dim=1) / indexes + return 
{ + "input_ids": batch["input_ids"], + "predict_ids": prediction_ids, + "labels": batch["labels"], + "decoder_inputs": batch["decoder_input_ids"], + "loss": loss, + } + + def save_preditions(self, result, args): + with open(args.prediction_res_path, "w", encoding="utf8") as fw: + preditions = [] + labels = [] + for batch in result: + print(batch.keys()) + batch_labels = torch.where( + batch["labels"] != -100, + batch["labels"], + self.tokenizer.pad_token_id, + ) + for i in range(len(batch["input_ids"])): + context = self.tokenizer.decode( + batch["input_ids"][i], + skip_special_tokens=True, + cleanup_tokenization_space=True, + ) + pred = self.tokenizer.decode( + batch["predict_ids"][i], + cleanup_tokenization_space=True, + skip_special_tokens=True, + ) + target = generate_sentence( + self.tokenizer.batch_decode( + batch_labels[i], cleanup_tokenization_space=True + ) + ) + pred = pred.lstrip("") + target = target.lstrip("") + self.rouge_metric.update( + preds=chinese_char_tokenize(pred), + target=chinese_char_tokenize(target), + ) + preditions.append(list(pred)) + labels.append([list(target)]) + fw.write("context:" + "".join(context) + "\n") + fw.write("pred:" + pred + "\n") + fw.write("target" + target + "\n") + fw.write("loss:{:.6f}\n".format(batch["loss"][i].item())) + fw.write("\n") + bleu = compute_bleu(preditions, labels) + fw.write("bleu:{}".format(bleu)) + print("finish prediction, saved in {}".format(args.prediction_res_path)) + return preditions, labels + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_true = labels.float() + pad_num = torch.sum(torch.eq(labels, -100)) + corr = torch.eq(y_pred, y_true) + acc = (torch.sum(corr.float()) - pad_num) / ( + y_true.view(size=(-1,)).shape[0] - pad_num + ) + return acc + + +class PredictDataModule(UniversalDataModule): + + def predict_dataloader(self): + return self.test_dataloader() + + +def main(): + + total_parser = argparse.ArgumentParser("Finetune Dialogue model.") + total_parser.add_argument("--do_eval_only", action="store_true", default=False) + total_parser.add_argument("--pretrained_model_path", default=None, type=str) + total_parser.add_argument("--new_vocab_path", default=None, type=str) + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["t5_tokenizer", "bert_tokenizer"], + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument("--preprocessing_num_workers", default="10", type=int) + total_parser.add_argument("--ckpt_path", default=None, type=str) + total_parser.add_argument("--use_cache", default=False, type=bool) + total_parser.add_argument( + "--formator", default="dialog", choices=["dialog", "ccqa", "t5style"] + ) + + sys.path.append("../../../") + + from fengshen.utils.universal_checkpoint import UniversalCheckpoint + from qa_dataset import T5StyleDataset, TextGenCollator + + total_parser = T5StyleDataset.add_data_specific_args(total_parser) + total_parser = UniversalDataModule.add_data_specific_args( + total_parser + ) # TaskDataModel + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = QAFinetuneModel.add_model_specific_args( + total_parser + ) # todo: check names + + args = total_parser.parse_args() + print("Argument parse success.") + print("superviseT5DataModel load start {}".format(get_time_str())) + + config = MT5Config.from_pretrained(args.pretrained_model_path) + collate_fn = TextGenCollator( + 
config=config, + pad_token_id=config.pad_token_id, + decoder_start_token_id=config.decoder_start_token_id, + formator=args.formator) + if not args.do_eval_only: + datasets = {'train': T5StyleDataset(args.train_file, args, load_data_type=0, data="train"), + 'validation': T5StyleDataset(args.val_file, args, load_data_type=0, data="dev")} + + model = QAFinetuneModel(args) + print("superviseT5DataModel load end {}".format(get_time_str())) + + data_model = UniversalDataModule( + tokenizer=None, args=args, collate_fn=collate_fn, datasets=datasets + ) + print('data loaded') + checkpoint_callback = UniversalCheckpoint(args) + lr_monitor = LearningRateMonitor(logging_interval="step") + logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, "logs/") # TOCHANGE + ) + trainer = Trainer.from_argparse_args( + args, logger=logger, callbacks=[checkpoint_callback, lr_monitor] + ) + trainer.fit(model, data_model) + else: + datasets = {'test': T5StyleDataset(args.test_file, args, load_data_type=0, data="test")} + + data_model = PredictDataModule( + tokenizer=None, args=args, collate_fn=collate_fn, datasets=datasets + ) + + tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path) + model = QAFinetuneModel(args=args) + trainer = Trainer.from_argparse_args(args) + result = trainer.predict(model, data_model, ckpt_path=args.ckpt_path) + predictions, labels = model.save_preditions(result, args) + sample = result[0] # first_batch + batch_labels = torch.where( + sample["labels"] != -100, sample["labels"], model.tokenizer.pad_token_id + ) + for i in range(4): + print(tokenizer.batch_decode(sample["input_ids"][i])) + print(tokenizer.batch_decode(sample["predict_ids"][i])) + print(tokenizer.batch_decode(batch_labels[i])) + + +def compute_f1(cand, ref): + f1_score = [] + for p, t in zip(cand, ref): + p_tokens = p.split() + t_tokens = t.split() + common = Counter() & Counter(t.split()) + num_same = sum(common.values()) + if len(t_tokens) == 0 or len(p_tokens) == 0: + f1 = int(p == t) + elif num_same == 0: + f1 = 0 + else: + precision = 1.0 * num_same / len(p_tokens) + recall = 1.0 * num_same / len(t_tokens) + f1 = (2 * precision * recall) / (precision + recall + 1e-8) + f1_score.append(f1) + f1 = sum(f1_score) / float(len(cand)) + return f1 + + +def generate_sentence(raw_list): + words = [] + i = 0 + while i < len(raw_list) and raw_list[i] != "": + words.append(raw_list[i]) + i += 1 + return "".join(words) + + +def remove_pad(raw_text, ref=False): + if ref: + return [raw_text.lstrip("")] + else: + return raw_text.lstrip("") + + +def compute_bleu(preditions, labels): + + score_nltk = corpus_bleu(labels, preditions) + return score_nltk + + +def get_gradient_norm(model): + total_norm = 0 + parameters = [ + p for p in model.parameters() if p.grad is not None and p.requires_grad + ] + for p in parameters: + param_norm = p.grad.detach().data.norm(2) + total_norm += param_norm.item() ** 2 + total_norm = total_norm**0.5 + return total_norm + + +def get_time_str(): + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/qa_t5/qa_dataset.py b/fengshen/examples/qa_t5/qa_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d1d395fa798f5b3b9c3eaf33cd5dcca1ff67722a --- /dev/null +++ b/fengshen/examples/qa_t5/qa_dataset.py @@ -0,0 +1,187 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : qa_dataset.py +@Time : 2022/10/28 19:57 +@Author : He Junqing +@Version : 1.0 +@Contact : hejunqing@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +# here put the import lib + +from dataclasses import dataclass +import numpy as np +import torch +from torch.nn.utils.rnn import pad_sequence + +from fengshen.data.t5_dataloader.t5_gen_datasets import DialogDataset + + +class T5StyleDataset(DialogDataset): + + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group("Dataset") + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument("--max_knowledge_length", default=128, type=int) + parser.add_argument("--max_target_length", default=128, type=int) + return parent_args + + def regular_tokenize(self, sample): + """ + sample.keys:question:str,context:stc, answer:[],idx:int,ans_span:[] + """ + plain_text = ( + "question:" + + sample["question"] + + "knowledge:" + + sample["context"][: self.max_knowledge_length] + ) + l_text = len(plain_text) + + ctx_len = self.max_seq_length - l_text - 1 + if ctx_len > 0 and "history" in sample: + context = "[SEP]".join(sample["history"]) + plain_text += "context:" + context + + res_prefix = self.tokenizer.encode("answer:", add_special_tokens=False) + # res_prefix.tolist() + l_rp = len(res_prefix) + + tokenized = self.tokenizer.encode( + plain_text, + add_special_tokens=False, + truncation=True, + max_length=self.max_seq_length - 2 - l_rp, + ) + # tokenized.tolist() + tokenized += res_prefix + # add maskid + mask_id = self.tokenizer.convert_tokens_to_ids("") + tokenized.append(mask_id) + tokenized.append(self.eos_token_id) + # print(tokenized) + + target_ids = self.tokenizer.encode( + "" + sample["answer"][0], + add_special_tokens=True, + truncation=True, + max_length=self.max_target_length, + ) + + # print(target_ids) + tokenized_sample = {} + tokenized_sample["input_ids"] = np.array(tokenized, dtype=np.int32) + tokenized_sample["attention_mask"] = np.ones(len(tokenized), dtype=np.int8) + tokenized_sample["labels"] = np.array(target_ids, dtype=np.int32) + tokenized_sample["idx"] = sample["idx"] + # print(tokenized_sample) + return tokenized_sample + + +@dataclass +class TextGenCollator: + ''' + ''' + config: None + pad_token_id: -100 + decoder_start_token_id: 0 + formator: str = 't5style' + + def setup(self): + pass + + def __call__(self, samples): + batch = { + k: [ + torch.tensor(samples[i][k], dtype=torch.int64) + for i in range(len(samples)) + ] + for k in ["input_ids", "attention_mask", "labels"] + } + batch["idx"] = torch.tensor([samples[i]["idx"] for i in range(len(samples))]) + + # print(batch) + for k, v in batch.items(): + if k != "labels" and k != "idx": + batch[k] = pad_sequence( + v, batch_first=True, padding_value=self.pad_token_id + ) + elif k == "labels": + batch[k] = pad_sequence(v, batch_first=True, padding_value=-100) + + batch["decoder_input_ids"] = torch.tensor( + self.shift_tokens_right( + batch["labels"], self.pad_token_id, self.decoder_start_token_id + ), + dtype=torch.long, + ) + return batch + + def shift_tokens_right( + 
self, input_ids: np.array, pad_token_id: int, decoder_start_token_id: int + ) -> np.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = np.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1] + shifted_input_ids[:, 0] = decoder_start_token_id + + shifted_input_ids = np.where( + shifted_input_ids == -100, pad_token_id, shifted_input_ids + ) + return shifted_input_ids + + +if __name__ == "__main__": + # test + import argparse + + total_parser = argparse.ArgumentParser("DATASET parser") + total_parser.add_argument( + "--tokenizer_type", + default="t5_tokenizer", + choices=["bert_tokenizer", "t5_tokenizer"], + ) + total_parser.add_argument("--preprocessing_num_workers", default="4", type=int) + total_parser.add_argument( + "--new_vocab_path", + default=None, + type=str, + ) + + total_parser.add_argument( + "--pretrained_model_path", + default="YOUR DOWNLOAD MODEL PATH", + ) + total_parser.add_argument("--train_split_size", default=0.995, type=int) + total_parser.add_argument( + "--formator", default="t5style", choices=["t5style", "squad", "dialog"] + ) + total_parser = TextGenCollator.add_data_specific_args(total_parser) + args = total_parser.parse_args() + args.train_data_path = "cmrc" + ds = T5StyleDataset("cmrc", args, "dev") + print(len(ds)) + for i in range(10): + print(ds[i]) + + dl = TextGenCollator(args) + for i in range(5): + for batch in dl.val_dataloader(): + print(batch) + print(batch["input_ids"]) + print(batch["no_answer"]) + print(batch["decoder_input_ids"]) + print(batch["labels"]) diff --git a/fengshen/examples/qa_t5/run_finetune.sh b/fengshen/examples/qa_t5/run_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e8e1f4b0fe07a8d2807e44d55a1f22cb2ef6439 --- /dev/null +++ b/fengshen/examples/qa_t5/run_finetune.sh @@ -0,0 +1,109 @@ +#!/bin/bash +#SBATCH --job-name=finetune-cmrc +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o $YOUR_PROJECT_DIR/%x-%j.log +#SBATCH -e $YOUR_PROJECT_DIR/%x-%j.err + +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 + +ROOT_DIR=$YOUR_PROJECT_DIR +DOWNLOAD_MODEL_PATH=$YOUR_PROJECT_DIR/Randeng-T5-784M-QA-Chinese/ + + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_dialog_784M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=$YOUR_HOME/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --save_ckpt_path $ROOT_DIR/ckpt \ + --save_top_k 5 \ + --every_n_train_steps 100\ + --monitor val_rougeL_fmeasure \ + --mode max \ + --save_last \ + --check_val_every_n_epoch 1 \ + --num_workers 4 \ + --dataloader_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ + --formator t5style \ + --filename model-{epoch:02d}-{val_loss:.4f}-{val_rougeL_fmeasure:.3f} \ + --precision 16 \ +" + +TRAIN_DATA_PATH=$YOUR_TRAIN_FILE +DEV_DATA_PATH=$YOUR_DEV_FILE + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --train_file $TRAIN_DATA_PATH \ + --val_file $DEV_DATA_PATH \ + --max_seq_length 512 \ + --max_knowledge_length 425 \ + --max_target_length 128 +" + +MODEL_ARGS=" + --pretrained_model_path $DOWNLOAD_MODEL_PATH \ + --tokenizer_type t5_tokenizer \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.1 \ + --sheduler_type polynomial \ + --min_learning_rate 1e-5 \ +" + +SCRIPTS_PATH=$YOUR_PROJECT_DIR/Fengshenbang-LM/fengshen/examples/qa_t5/finetune_t5_cmrc.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# conda activate fs +# export CUDA_VISIBLE_DEVICES=5 +srun python $CMD diff --git a/fengshen/examples/qa_t5/run_predict.sh b/fengshen/examples/qa_t5/run_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..8b8470ed1136320b75ba6da51209b3c9af9c74d0 --- /dev/null +++ b/fengshen/examples/qa_t5/run_predict.sh @@ -0,0 +1,110 @@ +#!/bin/bash +#SBATCH --job-name=predict-cmrc +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH --cpus-per-task=4 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH -o $YOUR_SLURM_LOG_PATH/%x-%j.log +#SBATCH -e $YOUR_SLURM_LOG_PATH/%x-%j.err + +# +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=8 + +ROOT_DIR=$YOUR_PROJECT_DIR +DOWNLOAD_MODEL_PATH=$YOUR_PROJECT_DIR/Randeng-T5-784M-QA-Chinese/ +#YOUR_MODEL_DIR + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +ZERO_STAGE=1 + +config_json="$ROOT_DIR/ds_config.randeng_t5_dialog_784M.$SLURM_JOBID.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=$YOUR_HOME/tmp/torch_extendsions +# strategy=ddp +strategy=deepspeed_stage_1 + +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy ${strategy} \ + --default_root_dir $ROOT_DIR \ + --save_ckpt_path $ROOT_DIR/ckpt \ + --save_top_k 5 \ + --every_n_train_steps 100\ + --monitor val_rougeL_fmeasure \ + --mode max \ + --save_last \ + --check_val_every_n_epoch 1 \ + --num_workers 4 \ + --dataloader_workers 4 \ + --replace_sampler_ddp False \ + --accumulate_grad_batches 2 \ + --formator t5style \ + --filename model-{epoch:02d}-{val_loss:.4f}-{val_rougeL_fmeasure:.3f} \ + --do_eval_only \ + --prediction_res_path $ROOT_DIR/predictions_sampling.txt \ + --decode_strategy sampling \ + --precision 16 \ +" + +TEST_FILE_PATH=$YOUR_DATA_FILE + +DATA_ARGS=" + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_file $TEST_FILE_PATH \ + --max_seq_length 512 \ + --max_knowledge_length 425 \ + --max_target_length 128 +" +MODEL_ARGS=" + --pretrained_model_path $DOWNLOAD_MODEL_PATH\ + --tokenizer_type t5_tokenizer \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --warmup_ratio 0.1 \ + --sheduler_type polynomial \ + --min_learning_rate 1e-5 \ +" + +SCRIPTS_PATH=$YOUR_PROJECT_DIR/Fengshenbang-LM/fengshen/examples/qa_t5/finetune_t5_cmrc.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD +# conda activate fs +# export CUDA_VISIBLE_DEVICES=5 +srun python $CMD diff --git a/fengshen/examples/randeng_reasoning/README.md b/fengshen/examples/randeng_reasoning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7ccc3df3d5c3fe50ebd52f1ddc8a822e13e6528 --- /dev/null +++ b/fengshen/examples/randeng_reasoning/README.md @@ -0,0 +1,161 @@ +# 燃灯系列-因果推理生成模型 + +- Huggingface: + - [Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese) + - [Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese) +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM/fengshen/examples/randeng_reasoning) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) +- Demo: [Reasoning Tree](https://idea.edu.cn/ccnl-act/reasoning/) + +## 简介 Brief Introduction + +基于Transformer-XL的中文因果推理生成模型和反绎推理生成模型。 + +Chinese deductive reasoning model and abductive reasoning model based on Transformer-XL. 
+ +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 通用 General | 自然语言生成 NLG | 燃灯 Randeng | TransformerXL | 5.0B | 中文-因果推理 Chinese-Reasoning | + +## 模型信息 Model Information + +**数据准备 Corpus Preparation** + +* 悟道语料库(280G版本) +* 因果语料库(2.3M个样本):基于悟道语料库(280G版本),通过关联词匹配、人工标注 + [GTSFactory](https://gtsfactory.com/)筛选、数据清洗等步骤获取的具有因果关系的句子对 + +* Wudao Corpus (with 280G samples) +* Wudao Causal Corpus (with 2.3 million samples): Based on the Wudao corpus (280G version), sentence pairs with causality were obtained through logic indicator matching, manual annotation + [GTSFactory](https://gtsfactory.com/), and data cleaning. + +**训练流程 Model Training** +1. 在悟道语料库(280G版本)上进行预训练 +2. 在1.5M因果语料上分别进行因果生成任务和反绎生成任务的训练 +3. 基于其余0.8M因果语料,[Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese)、[Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese)和[Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese)进行Self-consistent闭环迭代训练 + * 两个生成模型基于核采样和贪心的方式进行因果推理和反绎推理,产生大量伪样本; + * Erlangshen-Roberta-330M-Causal-Chinese模型对伪样本句子对的因果关系进行打分,筛选供自身以及生成模型训练的样本 + +First, the Transformer-XL model was pre-trained on the Wudao Corpus (with 280G samples) and annotated similar-sentence pair dataset (same as [Randeng-TransformerXL-1.1B-Paraphrasing-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-1.1B-Paraphrasing-Chinese)). +Then, the model was trained on our causal corpus (about 1.5 million samples) for the deductive reasoning task. +At last, based on the remaining 0.8 million samples of the causal corpus, we conducted self-consistent learning on [Randeng-TransformerXL-5B-Deduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese) and [Randeng-TransformerXL-5B-Abduction-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese), cooperating with [Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese). +Specifically, two generative models performed deductive reasoning and abductive reasoning based on each sample respectively, generating a large number of pseudo-samples; [Erlangshen-Roberta-330M-Causal-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Roberta-330M-Causal-Chinese) scored the causality of the pseudo-samples and selected the training data for itself and the generative models in the next iteration. 
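+
+To make the self-consistent loop described above more concrete, the sketch below shows one iteration in schematic form. It is not the project's released training code: it assumes `deduction_generate` / `abduction_generate` return a list of generated sentences (as in the usage example further down), and `causal_score` is a hypothetical stand-in for scoring a cause-effect pair with Erlangshen-Roberta-330M-Causal-Chinese; the 0.5 threshold is likewise a placeholder.
+
+```python
+from fengshen.models.transfo_xl_reasoning import deduction_generate, abduction_generate
+
+def self_consistent_round(sentences, deduction_model, abduction_model, tokenizer,
+                          causal_score, threshold=0.5):
+    """One iteration: generate pseudo cause-effect pairs, then keep only the pairs
+    whose causality score clears the threshold; the kept pairs are used to train
+    both the scorer and the two generators in the next iteration."""
+    pseudo_pairs = []
+    for sent in sentences:
+        # deduction: treat the sentence as a cause and sample candidate effects
+        for effect in deduction_generate(deduction_model, tokenizer, sent, device=0):
+            pseudo_pairs.append((sent, effect))
+        # abduction: treat the sentence as an effect and sample candidate causes
+        for cause in abduction_generate(abduction_model, tokenizer, sent, device=0):
+            pseudo_pairs.append((cause, sent))
+    return [pair for pair in pseudo_pairs if causal_score(*pair) > threshold]
+```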
+ +## 加载模型 Loading Models + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +``` + +```python +from fengshen.models.transfo_xl_reasoning import TransfoXLModel +from transformers import T5Tokenizer as TransfoXLTokenizer +deduction_model = TransfoXLModel.from_pretrained('IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese') +abduction_model = TransfoXLModel.from_pretrained('IDEA-CCNL/Randeng-TransformerXL-5B-Abduction-Chinese') +tokenizer = TransfoXLTokenizer.from_pretrained( + "IDEA-CCNL/Randeng-TransformerXL-5B-Deduction-Chinese", + eos_token='<|endoftext|>', + pad_token='<|endoftext|>', + extra_ids=0 +) +tokenizer.add_special_tokens({'bos_token': ''}) +``` + +## 使用示例 Usage Example + +```python +from fengshen.models.transfo_xl_reasoning import deduction_generate, abduction_generate +input_text = "机器人统治世界" +input_texts = ["机器人统治世界", "玉米价格持续上涨"] +print(deduction_generate(deduction_model, tokenizer, input_text, device=0)) +print(deduction_generate(deduction_model, tokenizer, input_texts, device=0)) +print(abduction_generate(abduction_model, tokenizer, input_text, device=0)) +print(abduction_generate(abduction_model, tokenizer, input_texts, device=0)) +``` + +## 一些例子 Some Cases + +(以下内容为模型生成,不代表本单位观点) + +**因果推理 Deduction** +1. 鲸鱼是哺乳动物,哺乳动物是恒温动物 + - 鲸鱼拥有极强的恒温能力 + - 鲸鱼需要消耗大量的能量 + - 鲸鱼能在海洋中生存下来,完全依靠其体温恒定 +2. 鲸鱼是哺乳动物,哺乳动物不是恒温动物 + - 鲸鱼不是恒温动物,鲸鱼是变温动物 + - 鲸鱼会随温度变化而变化 + - 鲸鱼有冬眠现象 +3. 机器人统治世界 + - 机器人取代人类做大部分工作 + - 机器人世界的法律必须能够适用于机器人统治的现实世界 + - 机器人必须能够相互沟通,并能够处理人类无法处理的复杂情况 + - 未来将不再需要大量工人,机器人将接管工厂 +4. 玉米价格持续上涨 + - 玉米淀粉价格也呈现上涨趋势 + - 玉米种植效益不断攀升 + - 在玉米深加工行业引起了一阵骚动 +5. 实体经济融资难、融资贵 + - 急需发展互联网金融等金融业态,为实体经济提供融资服务 + - 融资需求向金融资产转移,增加了金融资产供给 + - 必须大力发展资本市场,使资本市场成为经济转型的助推器 +6. 影响华北地区的冷空气势力偏弱 + - 冷空气的影响时间将偏短 + - 冷空气影响结束后,华北地区气温会继续缓慢回升 + - 华北地区气温较常年同期偏高 + +**反绎推理 Abduction** +1. 玉米价格持续上涨 + - 玉米库存较低,需求增加 + - 东北地区受降雨天气影响,玉米生长受到影响 + - 今年玉米种植面积大幅度下降 +2. 玉米价格下跌 + - 玉米的库存量大,需求量低 + - 今年玉米产量创新高,而需求不足 + - 目前玉米市场处于供大于求的状态,再加上近期华北地区遭遇了强降雨天气,玉米质量下降 +3. 农作物大量死亡 + - 旱灾持续时间长,又无雨,土壤干裂,作物得不到水分 + - 霜冻来临,气温骤降,植物受冻 + - 许多农民为了使农作物能够长得更好,使用更多的农药,并且没有合理的休耕措施 +4. 鲸鱼需要消耗大量的能量 + - 鲸鱼的体型庞大,新陈代谢速度又快 + - 鲸鱼的身体结构特殊,需要消耗大量的能量来维持身体结构的稳定 +5. 实体经济融资难、融资贵 + - 融资渠道单一,实体经济难以获得充足的资金 + - 实体经济融资主要依赖抵押、担保、信贷等间接融资方式,存在抵押物不足、担保机制不完善等问题 + - 实体经济往往需要大量的资金,而银行受制于风险控制、资本充足率等要求,很难大量发放贷款 +6. 
火山爆发导致植物死亡 + - 火山灰会阻碍植物吸收阳光 + - 火山灰的飘散,导致植物无法吸收到足够的氧气 + - 火山喷发时,岩浆温度极高,植物无法承受 + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` \ No newline at end of file diff --git a/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ca513231810e3c7020e1ee4657c53ce286a5e7 --- /dev/null +++ b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.py @@ -0,0 +1,317 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from dataclasses import dataclass +import copy +import logging +import torch.nn.functional as F +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor +from torch.utils.data import Dataset, DataLoader +from torch.utils.data._utils.collate import default_collate +from fengshen.models.tagging_models.bert_for_tagging import BertLinear,BertCrf,BertSpan,BertBiaffine +from fengshen.data.sequence_tagging_dataloader.sequence_tagging_collator import CollatorForLinear, CollatorForCrf, CollatorForSpan, CollatorForBiaffine +from fengshen.data.sequence_tagging_dataloader.sequence_tagging_datasets import DataProcessor, get_datasets +from fengshen.metric.metric import EntityScore +from fengshen.models.model_utils import configure_optimizers, get_total_steps +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.data.universal_datamodule import UniversalDataModule + +from transformers import ( + BertTokenizer, BertConfig, AutoTokenizer +) +from fengshen.metric.utils_ner import get_entities, bert_extract_item + + +_model_dict={ + 'bert-linear': BertLinear, + 'bert-crf': BertCrf, + 'bert-span': BertSpan, + 'bert-biaffine': BertBiaffine +} + +_collator_dict={ + 'linear': CollatorForLinear, + 'crf': CollatorForCrf, + 'span': CollatorForSpan +} + +_validation_dict={ + 'linear': 'validation_linear', + 'crf': 'validation_crf', + 'span': 'validation_span', + 'biaffine': 'validation_biaffine', +} + +_prediction_dict={ + 'linear': 'predict_linear', + 'crf': 'predict_crf', + 'span': 'predict_span', + 'biaffine': 'predict_biaffine', +} + +logger = logging.getLogger(__name__) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument("--max_seq_length", default=512, type=int) + parser.add_argument('--data_dir', default=None, type=str) + parser.add_argument('--model_type', default='bert', type=str) + parser.add_argument("--decode_type", default="linear", choices=["linear", "crf", "biaffine", "span"], type=str) + parser.add_argument('--loss_type', default='ce', type=str, choices=['lsr', 'focal', 'ce']) + return parent_args + + def __init__(self, args, id2label, tokenizer): + super().__init__() + + self.model_name=args.model_type+"-"+args.decode_type + self.id2label = id2label + + self.config=BertConfig.from_pretrained(args.model_path) + self.tokenizer = tokenizer + self.model = _model_dict[self.model_name].from_pretrained(args.model_path, config=self.config, num_labels=len(self.id2label), loss_type=args.loss_type) + self.entity_score=EntityScore() + + self.validate_fn=getattr(self,_validation_dict[args.decode_type]) + self.predict_fn=getattr(self,_prediction_dict[args.decode_type]) + + self.predict_result=[] + + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + self.log('train_loss', loss) + return loss + + def validation_step(self, batch, batch_idx): + self.validate_fn(batch,batch_idx) + + def validation_linear(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = 
preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + + for i, label in enumerate(labels): + y_true = [] + y_pred = [] + for j, m in enumerate(label): + if j == 0: + continue + elif j == (torch.sum(batch['attention_mask'][i]).item()-1): + true_subject=get_entities(y_true,self.id2label) + pred_subject=get_entities(y_pred,self.id2label) + self.entity_score.update(true_subject=true_subject, pred_subject=pred_subject) + break + else: + y_true.append(self.id2label[labels[i][j]]) + y_pred.append(self.id2label[preds[i][j]]) + + self.log('val_loss', loss) + + def validation_crf(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + + preds = self.model.crf.decode(logits, batch['attention_mask']) + preds = preds.detach().squeeze(0).cpu().numpy().tolist() + labels = batch['labels'].detach().cpu().numpy() + + for i, label in enumerate(labels): + y_true = [] + y_pred = [] + for j, m in enumerate(label): + if j == 0: + continue + elif j == (torch.sum(batch['attention_mask'][i]).item()-1): + true_subject=get_entities(y_true,self.id2label) + pred_subject=get_entities(y_pred,self.id2label) + self.entity_score.update(true_subject=true_subject, pred_subject=pred_subject) + break + else: + y_true.append(self.id2label[labels[i][j]]) + y_pred.append(self.id2label[preds[i][j]]) + + self.log('val_loss', loss) + + def validation_span(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + start_logits = outputs.start_logits + end_logits = outputs.end_logits + labels=batch['subjects'] + for i, T in enumerate(labels): + active_start_logits=start_logits[i][:batch['input_len'][i]] + active_end_logits=end_logits[i][:batch['input_len'][i]] + R = bert_extract_item(active_start_logits, active_end_logits) + + T=T[~torch.all(T==-1,dim=-1)].cpu().numpy() + T=list(map(lambda x:(self.id2label[x[0]],x[1],x[2]),T)) + R=list(map(lambda x:(self.id2label[x[0]],x[1],x[2]),R)) + + self.entity_score.update(true_subject=T, pred_subject=R) + self.log('val_loss', loss) + + def validation_biaffine(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.span_logits + + preds = torch.argmax(logits.cpu().numpy(), axis=-1) + labels = batch['span_labels'].cpu().numpy() + + for i, label in enumerate(labels): + input_len=(batch['input_len'][i])-2 + active_label=labels[i,1:input_len+1,1:input_len+1] + active_pred=preds[i,1:input_len+1,1:input_len+1] + + temp_1 = [] + temp_2 = [] + + for j in range(input_len): + for k in range(input_len): + if self.id2label[active_label[j,k]]!="O": + temp_1.append([self.id2label[active_label[j,k]],j,k]) + if self.id2label[active_pred[j,k]]!="O": + temp_2.append([self.id2label[active_pred[j,k]],j,k]) + + self.entity_score.update(pred_subject=temp_2, true_subject=temp_1) + + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def predict_step(self, batch, batch_idx): + batch['labels'] = None + outputs = self.model(**batch) + + self.predict_fn(batch,batch_idx) + + def predict_linear(self, batch, outputs): + logits = torch.argmax(F.log_softmax(outputs.logits, dim=2), dim=2) + preds = logits.detach().cpu().numpy() + + 
for i, pred in enumerate(preds): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + pred = pred[:batch['input_len'][i]][1:-1] + label_entities = get_entities(pred, self.id2label) + for label_list in label_entities: + label_list.append("".join(text[label_list[1]:label_list[2]+1])) + + self.predict_result.extend(label_entities) + + def predict_crf(self, batch, batch_idx): + logits = self.model(**batch).logits + preds = self.model.crf.decode(logits, batch['attention_mask']).squeeze(0).cpu().numpy().tolist() + + for i, pred in enumerate(preds): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + pred = pred[:batch['input_len'][i]][1:-1] + label_entities = get_entities(pred, self.id2label) + for label_list in label_entities: + label_list.append("".join(text[label_list[1]:label_list[2]+1])) + + self.predict_result.extend(label_entities) + + def predict_span(self, batch, batch_idx): + batch['start_positions'] = None + batch['end_positions'] = None + outputs = self.model(**batch) + + start_logits, end_logits = outputs.start_logits, outputs.end_logits + for i, _ in enumerate(start_logits): + text = self.tokenizer.convert_ids_to_tokens(batch['input_ids'][i])[:batch['input_len'][i]][1:-1] + R = bert_extract_item(start_logits[i][:batch['input_len'][i]], end_logits[i][:batch['input_len'][i]]) + if R: + label_entities = [[self.id2label[x[0]],x[1],x[2],"".join(text[x[1]:x[2]+1])] for x in R] + else: + label_entities = [] + + self.predict_result.extend(label_entities) + + + + def configure_optimizers(self): + return configure_optimizers(self) + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + + # * Args for data preprocessing + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + datasets=get_datasets(args) + + checkpoint_callback = UniversalCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + label2id,id2label=DataProcessor.get_labels(args) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + + collator = _collator_dict[args.decode_type]() + collator.args=args + collator.tokenizer=tokenizer + collator.label2id=label2id + data_model = UniversalDataModule(tokenizer,collator,args,datasets) + + model = LitModel(args,id2label,tokenizer) + print(label2id) + trainer.fit(model, data_model) + # trainer.predict(model,dataloaders=data_model.predict_dataloader()) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh new file mode 100644 index 0000000000000000000000000000000000000000..a477ed89852a4ec96139e85d7e44ed476aaeab76 --- /dev/null +++ b/fengshen/examples/sequence_tagging/finetune_sequence_tagging.sh @@ -0,0 +1,83 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # total number of tasks across all 
nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/lujunyu/experiments/ner_finetune/zen2_base_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) +#SBATCH -p hgx + + +ROOT_DIR=../../workspace +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=ner_bert_base +TASK=cmeee + +MODEL_NAME=bert-base +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=16 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +DATA_ARGS="\ + --num_workers 8 \ + --dataloader_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain \ + --data_dir /cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo \ + --model_type bert \ + --decode_type linear \ + --learning_rate 5e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.1 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_top_k -1 \ + --save_last \ + --every_n_train_steps 100 \ + --save_ckpt_path ${MODEL_ROOT_DIR} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --check_val_every_n_epoch 1 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + " + + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" + +python3 finetune_sequence_tagging.py $options + + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/stable_diffusion_chinese/README.md b/fengshen/examples/stable_diffusion_chinese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..edfb1d354bb2f98c1d24841ae86a70f07ceee37a --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/README.md @@ -0,0 +1,99 @@ +# Taiyi-Stable-Diffusion-1B-Chinese-v0.1 + +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) + +## 简介 Brief Introduction + +首个开源的中文Stable Diffusion模型,基于0.2亿筛选过的中文图文对训练。 + +The first open source Chinese Stable diffusion, which was trained on 20M filtered Chinese image-text pairs. 
+
+## 模型分类 Model Taxonomy
+
+| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| 特殊 Special | 多模态 Multimodal | 太乙 Taiyi | Stable Diffusion | 1B | Chinese |
+
+## 模型信息 Model Information
+
+我们将[Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)数据集(100M)和[Zero](https://zero.so.com/)数据集(23M)用作预训练的数据集,先用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)对这两个数据集的图文对相似性进行打分,取CLIP Score大于0.2的图文对作为我们的训练集。 我们使用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)作为初始化的text encoder,冻住[stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([论文](https://arxiv.org/abs/2112.10752))模型的其他部分,只训练text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。该模型目前在0.2亿图文对上训练了一个epoch。 我们在 32 x A100 训练了大约100小时。该版本只是一个初步的版本,我们将持续优化并开源后续模型,欢迎交流。
+
+We use [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) (100M) and [Zero](https://zero.so.com/) (23M) as our datasets, and take the image-text pairs with a CLIP Score (based on [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)) greater than 0.2 as our training set. We use [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese) to initialize the text encoder. To keep the powerful generative capability of Stable Diffusion and align Chinese concepts with the images, we only train the text encoder and freeze the other parts of the [stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4) ([paper](https://arxiv.org/abs/2112.10752)) model. So far the model has been trained for one epoch on the 20M image-text pairs, which took about 100 hours on 32 x A100 GPUs. This is a preliminary version; we will keep optimizing the model and open-source follow-up versions. We welcome any feedback and discussion!
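+
+As a rough illustration of the CLIP-score filtering described above (not the actual data-processing code of this project), the pairs can be scored with the Chinese Taiyi-CLIP text tower and the original ViT-L/14 image tower. The loading pattern below follows the Taiyi-CLIP model card (text features are taken from the `logits` of a `BertForSequenceClassification` head) and may differ in detail; `candidate_pairs` is a hypothetical placeholder for your own (image, caption) list.
+
+```python
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification, CLIPModel, CLIPProcessor
+
+# Chinese text tower (Taiyi-CLIP) + original OpenAI ViT-L/14 image tower
+text_tokenizer = BertTokenizer.from_pretrained("IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese")
+text_encoder = BertForSequenceClassification.from_pretrained("IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese").eval()
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").eval()
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+@torch.no_grad()
+def clip_score(image, caption: str) -> float:
+    # cosine similarity between the normalized text and image embeddings
+    text_inputs = text_tokenizer(caption, return_tensors="pt", padding=True)
+    text_features = text_encoder(**text_inputs).logits
+    image_inputs = processor(images=image, return_tensors="pt")
+    image_features = clip_model.get_image_features(**image_inputs)
+    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+    return (text_features @ image_features.T).item()
+
+candidate_pairs = []  # hypothetical list of (PIL image, Chinese caption) pairs to be filtered
+# keep only the pairs that clear the 0.2 threshold used for the training set
+filtered = [(img, txt) for img, txt in candidate_pairs if clip_score(img, txt) > 0.2]
+```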
+ +### Result +Basic Prompt + +| 铁马冰河入梦来,3D绘画。 | 飞流直下三千尺,油画。 | 女孩背影,日落,唯美插画。 | +| ---- | ---- | ---- | +| ![](result_examples/tiema.png) | ![](result_examples/feiliu.png) | ![](result_examples/nvhai.jpg) | + +Advanced Prompt + +| 铁马冰河入梦来,概念画,科幻,玄幻,3D | 中国海边城市,科幻,未来感,唯美,插画。 | 那人却在灯火阑珊处,色彩艳丽,古风,资深插画师作品,桌面高清壁纸。 | +| ---- | ---- | ---- | +| ![](result_examples/tiema2.jpg) | ![](result_examples/chengshi.jpg) | ![](result_examples/naren.jpg) | + + +## 使用 Usage + +### 全精度 Full precision + +```py +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1").to("cuda") + +prompt = '飞流直下三千尺,油画' +image = pipe(prompt, guidance_scale=7.5).images[0] +image.save("飞流.png") +``` + +### 半精度 Half precision FP16 (CUDA) + +添加 `torch_dtype=torch.float16` 和 `device_map="auto"` 可以快速加载 FP16 的权重,以加快推理速度。 +更多信息见 [the optimization docs](https://huggingface.co./docs/diffusers/main/en/optimization/fp16#half-precision-weights)。 + +```py +# !pip install git+https://github.com/huggingface/accelerate +import torch +from diffusers import StableDiffusionPipeline +torch.backends.cudnn.benchmark = True +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1", torch_dtype=torch.float16) +pipe.to('cuda') + +prompt = '飞流直下三千尺,油画' +image = pipe(prompt, guidance_scale=7.5).images[0] +image.save("飞流.png") +``` + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[总论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` + diff --git a/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png b/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png new file mode 100644 index 0000000000000000000000000000000000000000..1d8e3e97a5a2c8d324a92d5e0e26efea324c46de Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/hf_stable_blog.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/img/seed.png b/fengshen/examples/stable_diffusion_chinese/img/seed.png new file mode 100644 index 0000000000000000000000000000000000000000..8d82a8128a65a626f4c48867f6e99540e4970d0d Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/seed.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/img/test.md b/fengshen/examples/stable_diffusion_chinese/img/test.md new file mode 100644 index 0000000000000000000000000000000000000000..c8b1b42336f7e2c26898b5b99441d324f2de5412 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/img/test.md @@ -0,0 +1 @@ +delete diff 
--git a/fengshen/examples/stable_diffusion_chinese/img/ui.png b/fengshen/examples/stable_diffusion_chinese/img/ui.png new file mode 100644 index 0000000000000000000000000000000000000000..8c34d14cb0fc9379f67f865abd3f3ec33c46bc1b Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/img/ui.png differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" new file mode 100644 index 0000000000000000000000000000000000000000..996adacb1736d516defaa177a71c4545cf738df1 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef1d12d9fae3580549b54aebdb8454fb12e5e2dd7f7c61540a0149d40d071998 +size 2581243 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..65a5ca7d0e4bc626064476c687378ba2ca23ffce --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5226485d0ff3dd1b5d0a7c0b63e7b907aa221b364e84b137ce493ea35ea3d18b +size 2515750 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" new file mode 100644 index 0000000000000000000000000000000000000000..066a6106a60ed369891714ef2b68cd1f8f1d9e41 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212nega\345\271\277\345\221\212\347\254\246\345\217\267\350\257\215\346\261\207.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b8580dd7b305d7c15ae1337da6a88d9f8a05486a822aa13858eff7076d37e72 +size 2549987 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..3256b5982218860cd70bae3a5f851752ecebc20c --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\345\217\245\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3312a017440c8a9f2175d9bdc66a55888ebd75ef79709b664e12e290bc86edf4 +size 2494122 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..e0b5607e01d9734fc08f639bdae4ae2c95d8c2fc --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\346\204\237\345\217\271\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80edee518f20651a7e21196cbd5cb65422c18e60586782f147a0a8f6305cc2a3 +size 2508439 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..d0255d44e71ec407dd3e3044631f6c7b53de3744 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:911a5aeeeab3cf06387b6ada29afd40eea56a468dfda6fc71ce75273339dd84b +size 2480902 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" new file mode 100644 index 0000000000000000000000000000000000000000..720f33bd662470181aaa8fa90adefe0af84dfafd Binary files /dev/null and "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\344\270\255\346\226\207\351\200\227\345\217\267\346\240\207\350\256\260.png" differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..75f465dcf29526f60fb60036c9299f6636bf9be0 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbf28c2b24f38ceec6957b524d120f6e4ba517db73f8bab46d85cdd26bbcdce8 +size 2609499 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" 
"b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" new file mode 100644 index 0000000000000000000000000000000000000000..823973acc84f432e07965e4bde318e2d25322923 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:820341bdc8061c15bc1418f2e576d4b1b19212398d3a5f554dc759f23564f575 +size 2700124 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" new file mode 100644 index 0000000000000000000000000000000000000000..24e554d0d9c642867fab2c2884f37a17ce0bcb96 Binary files /dev/null and "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270256.png" differ diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" new file mode 100644 index 0000000000000000000000000000000000000000..2e6f2c6269b0b4ce79c57b8c9bcb4b54685230db --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270384.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b6ea049e5788fee0145d06cfb4056fed061a68a956a37361cb19ceed434a23 +size 1540886 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" new file mode 100644 index 0000000000000000000000000000000000000000..ecea935526ade3aa7ffdbdda31fe8b2ee022f3bf --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\345\244\215\346\235\202.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479ef4a81477c11f282ce791fd569d28d4b8c1016fae768e30334e85cbb2e7c0 +size 2809505 diff --git 
"a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" new file mode 100644 index 0000000000000000000000000000000000000000..bab90e44e01e5b235f060acb3bb178245bea57ad --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\347\262\276\347\273\206.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68da5d3b8cafd3d8339a8047eb2540d3a6827b8779877254cc1eabee8ffda4a +size 2791607 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" new file mode 100644 index 0000000000000000000000000000000000000000..9a746e8967b9e3bdc8d484d8b4bef1a2a6d30810 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\2674k\345\243\201\347\272\270\351\253\230\346\270\205.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dba3a4e18aacc50694454fe3475b3e407d94c35312b98cc3e84091ddae20464 +size 2746087 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" new file mode 100644 index 0000000000000000000000000000000000000000..f129b44b34dd5f0be9b3622d7ab495add79d90d9 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\217\222\347\224\273.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbf987b31c1b264dcebb1605357ad2bd3ee41ff953ab7a4f155554284254751 +size 2796841 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" new file mode 100644 index 0000000000000000000000000000000000000000..481e4bf5d6db76427026d741b12d616c1b6c2c74 --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\260\264\345\275\251.png" 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49fd318fd749f205b87742743a19d3efb2999fb6c6cc95ebdabf971cb0be7b1d +size 2810533 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" new file mode 100644 index 0000000000000000000000000000000000000000..59f0c85aecfa7d389b4905ace2adb88f49c29b1b --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\346\262\271\347\224\273.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9425ab60933c4e237a8c86f1754fbe5de7e989638b2bce509db5dd87bfc4aad +size 2941964 diff --git "a/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" new file mode 100644 index 0000000000000000000000000000000000000000..0abc8554d99d18fe4aeb7e907cc8215994fb666b --- /dev/null +++ "b/fengshen/examples/stable_diffusion_chinese/img/\346\227\245\345\207\272\357\274\214\346\265\267\351\235\242\344\270\212\350\213\261\346\226\207\351\200\227\345\217\267\347\264\240\346\217\217.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e507647da9b443096e33d65dbd389e0e8e94f2b96bbbadd84b997f22e53e608c +size 2451943 diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6c164a7715d3dad11fd992a65967a6be9ec129e9 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/chengshi.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png b/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png new file mode 100644 index 0000000000000000000000000000000000000000..eef1ec05852d686ef4476ad70f78a456d814cbca Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/feiliu.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg new file mode 100644 index 0000000000000000000000000000000000000000..646e0e9b3c464669483a4f19d6f3c438b658979c Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/naren.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cd032bc8ceab786bd5f9f51a58cc07e0ab4ce64d Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/nvhai.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png 
b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png new file mode 100644 index 0000000000000000000000000000000000000000..b7c806c456c4cb08039da2d7ceffba776090e498 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema.png differ diff --git a/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..de568bdb7ac108499d18a12c82aba98e51ddca70 Binary files /dev/null and b/fengshen/examples/stable_diffusion_chinese/result_examples/tiema2.jpg differ diff --git a/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md b/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md new file mode 100644 index 0000000000000000000000000000000000000000..2849521e6ec23b8b116974bb601cdc400b1a216a --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese/taiyi_handbook.md @@ -0,0 +1,425 @@ +# 太乙绘画使用手册1.0——AI人类助理入职指南 + +版本:2022.11.20 (Ver 1) + +编撰团队:IDEA CCNL 封神榜团队 +团队主页:https://github.com/IDEA-CCNL/Fengshenbang-LM + +腾讯文档版本:太乙绘画使用手册1.0 https://docs.qq.com/doc/DWklwWkVvSFVwUE9Q + +感谢所有参与编撰以及投稿的“助理们”!(微信搜索:fengshenbang-lm) + +**特别感谢名单(排名按投稿时间顺序):** +王军杰,甘如饴,陈伟峰,李夏禹,高昕宇, + +
+ +# 目录 +- [太乙绘画使用手册1.0——AI人类助理入职指南](#太乙绘画使用手册10ai人类助理入职指南) +- [目录](#目录) +- [前言](#前言) +- [入门手册(如何写一个优秀的提示词)](#入门手册如何写一个优秀的提示词) + - [懒人简洁版](#懒人简洁版) + - [一些基础准备](#一些基础准备) + - [一个逗号引发的水印](#一个逗号引发的水印) + - [反向prompt negative](#反向prompt-negative) + - [赋予某种属性(4k壁纸, 插画, 油画等)消除白边](#赋予某种属性4k壁纸-插画-油画等消除白边) + - [增加细节](#增加细节) + - [画幅(512×512)](#画幅512512) +- [引用](#引用) +- [联系我们](#联系我们) +- [版权许可](#版权许可) + +
+ +# 前言 + +本手册追求仅使用**自然语言**就可以生成**好看的**图片。 + +这是一本**免费的、开源的**手册,我们乐意于**接受每个人的投稿**,一同完善本手册。 + +本手册旨在提供一些关于中文文生图模型(太乙系列)的一些神奇的文本提示词,并且分享我们的一些神奇的发现(规则)。 + +本手册包括两大部分: +- 入门手册:提示词基础写法以及原理 +- 效果图册:一些我们觉得好看的图和对应的prompt + +本使用手册使用环境为: +- 模型 +https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1 + +- 环境 +WebUI +相关Github: https://github.com/IDEA-CCNL/Fengshenbang-LM/issues/186 + +参考:https://docs.qq.com/doc/DWHl3am5Zb05QbGVs + +
+ +# 入门手册(如何写一个优秀的提示词) + +![avatar](img/ui.png) + +
+ +## 懒人简洁版 +___ +
+ +提示词 Prompt: +> 不能出现中文的标点符号,比如中文的逗号,中文句号。并且需要赋予这幅画某种属性。 +> +> 如:长河落日圆, 4k壁纸 +> +
+ +反向提示词 Negative prompt: +> 一些负面词汇 +> +> 通用反向提示词:广告, ,, !, 。, ;, 资讯, 新闻, 水印 + +
+画幅大小设置为512×512最佳。 + + +
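+
+If you would rather call the model from Python than through the WebUI, the cheat sheet above translates roughly into the following diffusers call. This is only a sketch: the prompt, negative prompt and 512×512 size come from this section, while the scheduler, step count and output filename are ordinary assumptions rather than an exact reproduction of the WebUI settings.
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1", torch_dtype=torch.float16
+).to("cuda")
+
+image = pipe(
+    prompt="长河落日圆, 4k壁纸",                          # natural-language prompt plus an attribute word
+    negative_prompt="广告, ,, !, 。, ;, 资讯, 新闻, 水印",  # the generic negative prompt above
+    height=512,
+    width=512,
+    guidance_scale=7.5,
+    num_inference_steps=20,
+).images[0]
+image.save("changhe_luori.png")
+```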
+ +## 一些基础准备 +___ +
+ +以下实验的随机种子均为:1419200315 + +![avatar](img/ui.png) + +
+ +## 一个逗号引发的水印 +___ +
+ +我们来看看什么都不改会是咋样的。 + +日出,海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上中文逗号.png) + +
+ +可以看到,其实是会出现水印,以及画幅不满的问题的。 + +![avatar](img/日出,海面上中文逗号标记.png) + +
+ +那我们把中文逗号换成英文逗号呢? + +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +!!!神奇的事情出现了,水印消失了! + +
+
+会不会是标点符号的问题?所以我在上述英文逗号的基础上,添加一个中文的句号作为结尾。
+
+![avatar](img/日出,海面上中文句号.png)
+
+没错,神奇的事情出现了,水印回来了,而且位置一模一样。
+
+ +我甚至可以弄出更多的水印,比如加中文的感叹号。 + +日出, 海面上! +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上中文感叹号.png) + +所以,一个重要的结论为,中文的标点符号是和水印有着某种强相关的联系的! + +因此,我们输入提示词时,应该**不用任何中文标点符号**。 + +
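+
+Since the takeaway of this section is simply to avoid Chinese punctuation in prompts, a tiny helper like the one below (a convenience added here, not part of the handbook's WebUI workflow) can normalize a prompt before generation:
+
+```python
+# Map common Chinese punctuation to English equivalents before sending a prompt to the model.
+CN_PUNCT = str.maketrans({",": ", ", "。": ". ", "!": "! ", ";": "; ", ":": ": "})
+
+def normalize_prompt(prompt: str) -> str:
+    return prompt.translate(CN_PUNCT).strip()
+
+print(normalize_prompt("日出,海面上!"))  # -> "日出, 海面上!"
+```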
+ +## 反向prompt negative +___ +
+ +基本上就是把一些不好的词全加进去。 + +我们的原图为: + +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +日出, 海面上 +Negative prompt: 广告 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告.png) + +
+
+加上了广告之后,画面的表现力要好一些,比如图5的山的轮廓更好了。
+
+根据之前的一些经验,我们把中文标点也都加进反向提示词里:
+
+ +日出, 海面上 +Negative prompt: 广告, ,, !, 。, ; +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告符号.png) + +
+ +细节更多了点 + +
+ +日出, 海面上 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上nega广告符号词汇.png) + +
+ +所以,我们的反向提示词选择: **广告, ,, !, 。, ;, 资讯, 新闻, 水印** + +
+ +## 赋予某种属性(4k壁纸, 插画, 油画等)消除白边 +___ +
+ +我们的原图为: + +
+ +日出, 海面上 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号.png) + +
+ +我们添加了某种属性,比如 4k壁纸 之后: + +**4k壁纸** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+ +**interesting!图3的白边不见了!** + +
+
+一个可能的解释是,我们的训练数据中,用的是resize的方法来调整输入的图片,而这样做,对于边长小于512的图,会自动保留白边。而这也就导致了我们的生成会有白边。但是一旦给这幅画赋予了某种属性,就可以避免这件事了。
+
+
+(注,我试过3k壁纸和8k壁纸,都不行,估计是语料里真的没有。我试过 壁纸,这个prompt看起来不高清。)
+
+ +试试看别的属性 + +
+ +**插画** + +日出, 海面上, 插画 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号插画.png) + +
+
+插画,其实什么画风都可能出现,但总体来说都是画。
+
+ +**油画** + +日出, 海面上, 油画 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号油画.png) + +
+
+虽然图3出现了画框,但一幅油画带有画框也是正常的。
+
+ +**水彩** + +日出, 海面上, 水彩 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号水彩.png) + +
+ +**素描** + +日出, 海面上, 素描 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号素描.png) + + +
+ +## 增加细节 +___ +
+ +ok,我们回退一下。 + +
+ +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+
+如果我们希望有更多的细节呢?
+
+ +**复杂** + +日出, 海面上, 4k壁纸, 复杂 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸复杂.png) + +
+
+可以看到,复杂是有一定作用的,所有图的细节都增加了。
+
+ +**精细** + +日出, 海面上, 4k壁纸, 精细 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸精细.png) + +
+
+精细 的做法反而是把不少细节都做了平滑处理,过渡更加柔和。
+
+ +**高清** + +日出, 海面上, 4k壁纸, 高清 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸高清.png) + +
+ +只多了一点点细节,图2的海面上多了光斑,这么一说也许是光影效果好了一些。 + + +
+ +## 画幅(512×512) +___ +
+ +不同的画幅也会影响生成的内容和质量。 + +参考自:https://huggingface.co./blog/stable_diffusion + +![avatar](img/hf_stable_blog.png) + +
+ +在stable diffusion中也有这个相关的发现,512*512是最好的画幅。 + +
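+
+下文会对比 512、384、256 三种画幅的实际效果;如果想自己复现,可以用 `height`/`width` 参数直接控制画幅,示意写法如下(沿用前文示例中的模型与参数,仅供参考):
+
+```python
+# 仅作示意:同一提示词、同一种子,在不同画幅下各生成一张图做对比
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1"
+).to("cuda")
+
+for size in (512, 384, 256):
+    generator = torch.Generator(device="cuda").manual_seed(1419200315)
+    image = pipe(
+        "日出, 海面上, 4k壁纸",
+        negative_prompt="广告, ,, !, 。, ;, 资讯, 新闻, 水印",
+        height=size, width=size,
+        num_inference_steps=20,
+        guidance_scale=7,
+        generator=generator,
+    ).images[0]
+    image.save(f"日出_海面上_4k壁纸_{size}.png")
+```
+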
+ +我们看看正常的: + +
+ +**512*512** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 512x512, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸.png) + +
+ +**384*384** + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 384x384, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸384.png) + +
+ +低画幅会导致画面莫名撕裂,出图非常毛躁。 + +
+ +**256*256** + +如果我们进一步降低画质,会非常非常撕裂: + +日出, 海面上, 4k壁纸 +Negative prompt: 广告, ,, !, 。, ;, 资讯, 新闻, 水印 +Steps: 20, Sampler: PLMS, CFG scale: 7, Seed: 1419200315, Size: 256x256, Model hash: e2e75020, Batch size: 6, Batch pos: 0 + +![avatar](img/日出,海面上英文逗号4k壁纸256.png) + +# 引用 + +``` +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` + +# 版权许可 + +[Apache License 2.0](LICENSE) diff --git a/fengshen/examples/stable_diffusion_chinese_EN/README.md b/fengshen/examples/stable_diffusion_chinese_EN/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8bd939d901203225ea6902d688769390c7c10cd8 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/README.md @@ -0,0 +1,110 @@ +# Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1 + +- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM) +- Docs: [Fengshenbang-Docs](https://fengshenbang-doc.readthedocs.io/) + +## 简介 Brief Introduction + +首个开源的中英双语Stable Diffusion模型,基于0.2亿筛选过的中文图文对训练。 + +The first open source Chinese&English Bilingual Stable diffusion, which was trained on 20M filtered Chinese image-text pairs. + +## 模型分类 Model Taxonomy + +| 需求 Demand | 任务 Task | 系列 Series | 模型 Model | 参数 Parameter | 额外 Extra | +| :----: | :----: | :----: | :----: | :----: | :----: | +| 特殊 Special | 多模态 Multimodal | 太乙 Taiyi | Stable Diffusion | 1B | Chinese and English | + +## 模型信息 Model Information + +我们将[Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)数据集(100M)和[Zero](https://zero.so.com/)数据集(23M)用作预训练的数据集,先用[IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)对这两个数据集的图文对相似性进行打分,取CLIP Score大于0.2的图文对作为我们的训练集。 我们使用[stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([论文](https://arxiv.org/abs/2112.10752))模型进行继续训练,其中训练分为两个stage。 + +第一个stage中冻住模型的其他部分,只训练text encoder,以便保留原始模型的生成能力且实现中文概念的对齐。 + +第二个stage中将全部模型解冻,一起训练text encoder和diffusion model,以便diffusion model更好的适配中文guidance。 + +第一个stage我们训练了80小时,第二个stage训练了100小时,两个stage都是用了8 x A100。该版本是一个初步的版本,我们将持续优化模型并开源,欢迎交流! + +We use [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/)(100M) 和 [Zero](https://zero.so.com/)(23M) as our dataset, and take the image and text pairs with CLIP Score (based on [IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese](https://huggingface.co./IDEA-CCNL/Taiyi-CLIP-RoBERTa-102M-ViT-L-Chinese)) greater than 0.2 as our Training set. We finetune the [stable-diffusion-v1-4](https://huggingface.co./CompVis/stable-diffusion-v1-4)([paper](https://arxiv.org/abs/2112.10752)) model for two stage. + +Stage 1: To keep the powerful generative capability of stable diffusion and align Chinese concepts with the images, We only train the text encoder and freeze other part of the model in the first stage. + +Stage 2: We unfreeze both the text encoder and the diffusion model, therefore the diffusion model can have a better compatibility for the Chinese language guidance. + +It takes 80 hours to train the first stage, 100 hours to train the second stage, both stages are based on 8 x A100. This model is a preliminary version and we will update this model continuously and open sourse. Welcome to exchange! 
+ +### Result + +小桥流水人家,Van Gogh style。 +![](result_examples/xiaoqiao_vangogh.png) + +小桥流水人家,水彩。 +![](result_examples/xiaoqiao_oil_painting.png) + +吃过桥米线的猫。 +![](result_examples/cat_eating_guoqiao_noodle.png) + +穿着宇航服的哈士奇。 +![](result_examples/huskiy_wearing_space_suit.png) +## 使用 Usage + +### 全精度 Full precision + +```py +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1").to("cuda") + +prompt = '小桥流水人家,Van Gogh style' +image = pipe(prompt, guidance_scale=10).images[0] +image.save("小桥.png") +``` + +### 半精度 Half precision FP16 (CUDA) + +添加 `torch_dtype=torch.float16` 和 `device_map="auto"` 可以快速加载 FP16 的权重,以加快推理速度。 +更多信息见 [the optimization docs](https://huggingface.co./docs/diffusers/main/en/optimization/fp16#half-precision-weights)。 + +```py +# !pip install git+https://github.com/huggingface/accelerate +import torch +from diffusers import StableDiffusionPipeline + +torch.backends.cudnn.benchmark = True +pipe = StableDiffusionPipeline.from_pretrained("IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-EN-v0.1", torch_dtype=torch.float16) +pipe.to('cuda') + +prompt = '小桥流水人家,Van Gogh style' +image = pipe(prompt, guidance_scale=10.0).images[0] +image.save("小桥.png") +``` + + +## 引用 Citation + +如果您在您的工作中使用了我们的模型,可以引用我们的[总论文](https://arxiv.org/abs/2209.02970): + +If you are using the resource for your work, please cite the our [paper](https://arxiv.org/abs/2209.02970): + +```text +@article{fengshenbang, + author = {Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen and Ruyi Gan and Jiaxing Zhang}, + title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence}, + journal = {CoRR}, + volume = {abs/2209.02970}, + year = {2022} +} +``` + +也可以引用我们的[网站](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +You can also cite our [website](https://github.com/IDEA-CCNL/Fengshenbang-LM/): + +```text +@misc{Fengshenbang-LM, + title={Fengshenbang-LM}, + author={IDEA-CCNL}, + year={2021}, + howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}}, +} +``` diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png new file mode 100644 index 0000000000000000000000000000000000000000..0c28cf33aaba77e00d357110487947d594b23e43 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/cat_eating_guoqiao_noodle.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:200db123a5b4f56315480d9e853e950a9c99b020e652cdc6875b57a90b1df9ae +size 2420718 diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..81ba2b54362dcf93a28e6af2f0a13f9610ad0b82 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/huskiy_wearing_space_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40597ef987aaa22d7b4de63f9b280c587fd8d973ba4a62fddf35f3df472134cb +size 2215674 diff --git 
a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png new file mode 100644 index 0000000000000000000000000000000000000000..86ae863ffe97e37bbfa0231e9033058c9d44d721 --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_oil_painting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7a9780d438506eb151b21d792ff6224f343f39818ed0aadbc786d378b05a54 +size 3006257 diff --git a/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png new file mode 100644 index 0000000000000000000000000000000000000000..f18d210aaf71655b907df7b3c39a6902f4e1942c --- /dev/null +++ b/fengshen/examples/stable_diffusion_chinese_EN/result_examples/xiaoqiao_vangogh.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867b1cb80acca6540fd45488b7cee37320c4e4340464a9c2f8d5c2ff76fa8e92 +size 3610124 diff --git a/fengshen/examples/stable_diffusion_dreambooth/duck_result.png b/fengshen/examples/stable_diffusion_dreambooth/duck_result.png new file mode 100644 index 0000000000000000000000000000000000000000..1104e1d3183cbfe3dfa1f6ef20e56daa75c07482 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/duck_result.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9d752cfd10e54199c248419ac9ea01440527913d5c75227f5e742aa0a5f5787 +size 1409847 diff --git a/fengshen/examples/stable_diffusion_dreambooth/readme.md b/fengshen/examples/stable_diffusion_dreambooth/readme.md new file mode 100644 index 0000000000000000000000000000000000000000..9e73440bc0b9a933a7aaadf912a33b3899ee3f60 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/readme.md @@ -0,0 +1,58 @@ +# Taiyi-Stable-Diffusion Dreambooth示例 + +本示例可以应用于[**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**](https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1)在自建的数据集上用[**DreamBooth**](https://arxiv.org/abs/2208.12242)的方法进行特定对象的训练,同时稍微修改代码也能够兼容大部分Stable-Diffusion结构。本示例仅提供参考,有任何疑问或者有需要协助的都可以提Issue到本项目中,会有专门的同学解答~ + +## 数据处理 + +在./train_images_duck下有我们进行展示的一个数据集样例 + +## 配置要求 + +[**IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1**](https://huggingface.co./IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1)十亿级别参数进行dreambooth训练,我们自己测试的配置基础如下,batch_size设定为1~2,另外也可以参考train_with_prior.sh进行fp16和deepspeed加速。 + +fp32: + +- 显存:26G以上 +- 内存:64G以上 + +## 运行脚本 + +标准版本 + +sh train.sh + +增加先验版本,具体可以参考[论文](https://arxiv.org/abs/2208.12242) + +sh train_with_prior.sh + +在脚本中也提供了丰富的超参供大家修改,例如batch_size, ckpt_path等等都可以根据自己的需求做更改,其中model_path指向的是huggingface上的模型路径,下载可能比较慢,如果用户已经在本地下载过一份权重,直接将model_path改成本地路径即可。 + +一些常用的参数我们会放在[封神榜的文档里](https://fengshenbang-doc.readthedocs.io/zh/latest/docs/%E5%B0%81%E7%A5%9E%E6%A1%86%E6%9E%B6/%E5%8F%82%E6%95%B0%E7%AE%A1%E7%90%86.html) + +有任何不清楚的地方,不要吝啬你的Issue,直接提过来。 + +## 一些训练中的Trick + +### Deepspeed + +在示例中我们默认开始了Deepspeed,通过Deepspeed我们能提高不少训练效率(即使是单卡)。并且得益于Zero Redundancy Optimizer的技术,在多卡的环境我们能显著的减少显存占用,提高batch_size以获得更高的效率,强烈建议有条件的同学开启Deepspeed。train_with_prior.sh在40G的A100上需要开启deepspeed. + +### 一点经验 + +- 图片选取质量更高,图片背景尽量选取纯色 + +- 对于人脸或者更复杂的可以采用较小的学习率,训练更长的step(800-1200) + +- 目训练的效果不是特别稳定,没有论文中如此惊艳的效果,不过参考[太乙webui的配置](https://github.com/IDEA-CCNL/stable-diffusion-webui/blob/master/README.md)还是能找到很多不错的效果图 + +- 持续探索中... 
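+
+训练结束后,train.py 会把模型以 diffusers 的目录格式保存到 default_root_dir 下的 hf_out_* 目录中,可以参考下面的示意代码加载它做推理(目录名、轮数和提示词都只是示例,请按自己的实际训练结果调整):
+
+```python
+# 仅作示意:加载 DreamBooth 训练保存的 hf_out_* 目录进行推理
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_dir = "../../workspace/taiyi-sd-dreambooth/hf_out_199"  # 假设的输出目录,按实际路径修改
+pipe = StableDiffusionPipeline.from_pretrained(model_dir, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+prompt = "[小黄鸭]在沙滩上晒太阳"   # 用训练时的 instance prompt 触发学到的对象
+image = pipe(prompt, guidance_scale=7.5).images[0]
+image.save("duck_inference.png")
+```
+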
+ +![结果](duck_result.png) + +### 参考资料 +https://arxiv.org/abs/2208.12242 + +https://dreambooth.github.io/ + +https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3 + diff --git a/fengshen/examples/stable_diffusion_dreambooth/requirements.txt b/fengshen/examples/stable_diffusion_dreambooth/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebb2db2051981e04dc7ab582b27b14904872aae3 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/requirements.txt @@ -0,0 +1,8 @@ +diffusers>==0.7.2 +torchvision +transformers>==4.24.0 +pytorch-lightning>==1.8.1 +ftfy +tensorboard +modelcards +deepspeed>==0.5.10 \ No newline at end of file diff --git a/fengshen/examples/stable_diffusion_dreambooth/train.py b/fengshen/examples/stable_diffusion_dreambooth/train.py new file mode 100644 index 0000000000000000000000000000000000000000..d783590e4ebb9e8069b6a5bebdd36f0be57309e6 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train.py @@ -0,0 +1,276 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : train.py +@Time : 2022/11/09 22:27 +@Author : Gan Ruyi +@Version : 1.0 +@Contact : ganruyi@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +import hashlib +import itertools +import os +from pathlib import Path +from tqdm.auto import tqdm +import torch +import argparse +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from transformers import BertTokenizer, BertModel, CLIPTokenizer, CLIPTextModel +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from torch.nn import functional as F +from fengshen.data.dreambooth_datasets.dreambooth_datasets import PromptDataset, DreamBoothDataset +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.data.dreambooth_datasets.dreambooth_datasets import add_data_args + + +class StableDiffusionDreamBooth(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Taiyi Stable Diffusion Module') + parser.add_argument('--train_text_encoder', action='store_true', default=False) + # dreambooth train unet only default + parser.add_argument('--train_unet', action='store_true', default=True) + return parent_parser + + def __init__(self, args): + super().__init__() + if 'Taiyi-Stable-Diffusion-1B-Chinese-v0.1' in args.model_path: + self.tokenizer = BertTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = BertModel.from_pretrained( + args.model_path, subfolder="text_encoder") # load from taiyi_finetune-v0 + else: + self.tokenizer = CLIPTokenizer.from_pretrained( + args.model_path, subfolder="tokenizer") + self.text_encoder = 
CLIPTextModel.from_pretrained( + args.model_path, subfolder="text_encoder") + self.vae = AutoencoderKL.from_pretrained( + args.model_path, subfolder="vae") + self.unet = UNet2DConditionModel.from_pretrained( + args.model_path, subfolder="unet") + self.noise_scheduler = DDPMScheduler.from_config( + args.model_path, subfolder="scheduler") + + # set model + self.vae.requires_grad_(False) + if not args.train_text_encoder: + self.requires_grad_(False) + if not args.train_unet: + self.requires_grad_(False) + + self.save_hyperparameters(args) + + def generate_extra_data(self): + global_rank = self.global_rank + device = self.trainer.device_ids[global_rank] + print('generate on device {} of global_rank {}'.format(device, global_rank)) + class_images_dir = Path(self.hparams.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < self.hparams.num_class_images: + pipeline = StableDiffusionPipeline.from_pretrained( + self.hparams.model_path, + safety_checker=None, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = self.hparams.num_class_images - cur_class_images + print(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(self.hparams.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=self.hparams.sample_batch_size) + + pipeline.to(device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=global_rank != 0 + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + # if torch.cuda.is_available(): + # torch.cuda.empty_cache() + + def setup(self, stage) -> None: + if self.hparams.with_prior_preservation: + self.generate_extra_data() + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + print('Total steps: {}' .format(self.total_steps)) + + def configure_optimizers(self): + model_params = [] + if self.hparams.train_unet and self.hparams.train_text_encoder: + model_params = itertools.chain(self.unet.parameters(), self.text_encoder.parameters()) + elif self.hparams.train_unet: + model_params = self.unet.parameters() + elif self.hparams.train_text_encoder: + model_params = self.text_encoder.parameters() + return configure_optimizers(self, model_params=model_params) + + def training_step(self, batch, batch_idx): + if self.hparams.train_text_encoder: + self.text_encoder.train() + if self.hparams.train_unet: + self.unet.train() + + latents = self.vae.encode(batch["pixel_values"]).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn(latents.shape).to(latents.device) + noise = noise.to(dtype=self.unet.dtype) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noisy_latents.to(dtype=self.unet.dtype) + + # Get the text embedding for 
conditioning + # with torch.no_grad(): + encoder_hidden_states = self.text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if self.hparams.with_prior_preservation: + # Chunk the noise and noise_pred into two parts and compute the loss on each part separately. + noise_pred, noise_pred_prior = torch.chunk(noise_pred, 2, dim=0) + noise, noise_prior = torch.chunk(noise, 2, dim=0) + # Compute instance loss + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + # Compute prior loss + prior_loss = F.mse_loss(noise_pred_prior, noise_prior, reduction="mean") + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(noise_pred, noise, reduction="mean") + self.log("train_loss", loss.item(), on_epoch=False, prog_bar=True, logger=True) + + if self.trainer.global_rank == 0: + if (self.global_step+1) % 5000 == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + args.model_path, unet=self.unet, text_encoder=self.text_encoder, tokenizer=self.tokenizer, + ) + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}')) + + return {"loss": loss} + + def on_train_end(self) -> None: + if self.trainer.global_rank == 0: + print('saving model...') + pipeline = StableDiffusionPipeline.from_pretrained( + args.model_path, unet=self.unet, text_encoder=self.text_encoder, tokenizer=self.tokenizer, + ) + pipeline.save_pretrained(os.path.join( + args.default_root_dir, f'hf_out_{self.trainer.current_epoch}')) + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = StableDiffusionDreamBooth.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + model = StableDiffusionDreamBooth(args) + + tokenizer = model.tokenizer + datasets = DreamBoothDataset( + instance_data_dir=args.instance_data_dir, + instance_prompt=args.instance_prompt, + tokenizer=tokenizer, + class_data_dir=args.class_data_dir, + class_prompt=args.class_prompt, + size=512, + center_crop=args.center_crop, + ) + # construct the datasets to a dict for universal_datamodule + datasets = {'train': datasets} + + def collate_fn(examples): + # print(examples) + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. 
+ if args.with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = tokenizer.pad( + {"input_ids": input_ids}, + padding="max_length", + max_length=tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + + return batch + + datamodule = UniversalDataModule( + tokenizer=tokenizer, collate_fn=collate_fn, args=args, datasets=datasets) + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + trainer = Trainer.from_argparse_args(args, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + trainer.fit(model, datamodule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/examples/stable_diffusion_dreambooth/train.sh b/fengshen/examples/stable_diffusion_dreambooth/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad3eb7ead394e6662168eb0b4947055277a01b58 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=1 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +# export CUDA_VISIBLE_DEVICES='7' +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-sd-dreambooth +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! 
-d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=1 + +MICRO_BATCH_SIZE=1 +INSTANCE_PROMPT="小黄鸭" +OUTPUT_DIR="saved_model_tinyduck" +INSTANCE_DIR="train_images_duck" + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --instance_data_dir=$INSTANCE_DIR \ + --instance_prompt=$INSTANCE_PROMPT \ + --resolution=512 \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/ \ + --train_text_encoder \ + --learning_rate 1e-6 \ + --scheduler_type constant \ + --warmup_steps 100 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_steps 1200 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy ddp \ + --log_every_n_steps 100 \ + --precision 32 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " +# run local +python train.py $options +# run on slurm +# srun python train.py $options \ No newline at end of file diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79a9894f17d1a4e06a2e06ee1fab125d030008da Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/02e791d8e91ddc2040e96675ab6873a.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fca7dc4c48c4ae2e67942f304146227c6ea7a261 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/ab1acecf23c6809a0fb12ffb169c795.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5a6aa286d6d55c3e6340457ac3e97fa3a7467b83 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/f2595677df44dddae46f23578ea91e9.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7b29eaeb52beace4f09b198f0b362ecd27b59283 Binary files /dev/null and b/fengshen/examples/stable_diffusion_dreambooth/train_images_duck/fa936e11c9f4419e91ad57d5041f739.jpg differ diff --git a/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh b/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh new file mode 100644 index 0000000000000000000000000000000000000000..623972b04949ed5b81eb708f1f3b908907100db4 --- /dev/null +++ b/fengshen/examples/stable_diffusion_dreambooth/train_with_prior.sh 
@@ -0,0 +1,100 @@ +#!/bin/bash +#SBATCH --job-name=taiyi-sd-dreambooth # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks-per-node=2 # number of tasks to run per node +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH -o %x-%j.log # output and error log file names (%x for job id) +#SBATCH -x dgx050 + +# pwd=Fengshenbang-LM/fengshen/examples/pretrain_erlangshen +ROOT_DIR=../../workspace +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=${ROOT_DIR}/torch_extendsions + +MODEL_NAME=taiyi-sd-dreambooth-prior +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} +fi + +NNODES=1 +GPUS_PER_NODE=2 +MICRO_BATCH_SIZE=2 +# 如果你不用Deepspeed的话 下面的一段话都可以删掉 Begin +CONFIG_JSON="$MODEL_ROOT_DIR/${MODEL_NAME}.ds_config.json" +ZERO_STAGE=1 +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $CONFIG_JSON +{ + "zero_optimization": { + "stage": ${ZERO_STAGE} + }, + "fp16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE +} +EOT +export PL_DEEPSPEED_CONFIG_PATH=$CONFIG_JSON +### End + +INSTANCE_PROMPT="[小黄鸭]" +OUTPUT_DIR="saved_model_duck2" +INSTANCE_DIR="train_images_duck" + +CLASS_PROMPT="小黄鸭" +CLASS_DIR="class_images_duck" + +DATA_ARGS="\ + --dataloader_workers 2 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --instance_data_dir=$INSTANCE_DIR \ + --instance_prompt=$INSTANCE_PROMPT \ + --class_prompt=$CLASS_PROMPT \ + --class_data_dir=$CLASS_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --num_class_images=200 \ + --resolution=512 \ + --sample_batch_size=1 \ + " + +MODEL_ARGS="\ + --model_path $MODEL_ROOT_DIR/pretrain/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/ \ + --train_text_encoder \ + --learning_rate 1e-6 \ + --scheduler_type constant \ + " + +MODEL_CHECKPOINT_ARGS="\ + --every_n_epochs 100 \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --load_ckpt_path ${MODEL_ROOT_DIR}/ckpt/last.ckpt \ + " + +TRAINER_ARGS="\ + --max_epochs 200 \ + --gpus $GPUS_PER_NODE \ + --num_nodes $NNODES \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --log_every_n_steps 100 \ + --precision 16 \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --replace_sampler_ddp False \ + --num_sanity_val_steps 0 \ + --limit_val_batches 0 \ + " +# num_sanity_val_steps, limit_val_batches 通过这俩参数把validation关了 + +export options=" \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ + " +# run local +# python train.py $options +# run on slurm +srun python train.py $options \ No newline at end of file diff --git a/fengshen/examples/summary/pretrain_bart_summary.sh b/fengshen/examples/summary/pretrain_bart_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8a6af24f935cc563891922b8a50cd293231367b --- /dev/null +++ b/fengshen/examples/summary/pretrain_bart_summary.sh @@ -0,0 +1,124 @@ +#!/bin/bash +#SBATCH --job-name=bart_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 # number of gpus +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=bart-base +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} + +ZERO_STAGE=1 +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +config_json="./ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via 
set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 5e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-4 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt='"' +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/pretrained_model/bart-base \ + --output_save_path $ROOT_DIR/${MODEL_NAME}_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --precision 16 \ +" + +SCRIPTS_PATH=seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +#singularity exec --nv -B /cognitive_comp/ganruyi/Megatron/:/cognitive_comp/ganruyi/Megatron/,/cognitive_comp/gaoxinyu/:/cognitive_comp/gaoxinyu/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" +# srun --nodes=1 --gres=gpu:4 --ntasks-per-node=4 --cpus-per-gpu=20 +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229623 bash -c 'python3 $SCRIPT_PATH $CMD' diff --git a/fengshen/examples/summary/randeng_pegasus_523M_summary.sh b/fengshen/examples/summary/randeng_pegasus_523M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..10f6d29a6acd1fe70117d0f1b8d33ce58cdb1384 --- /dev/null +++ b/fengshen/examples/summary/randeng_pegasus_523M_summary.sh @@ -0,0 +1,143 @@ +#!/bin/bash +#SBATCH --job-name=randeng_pegasus_523M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=randeng_pegasus_523M_summary_last +MICRO_BATCH_SIZE=128 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} + +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/${MODEL_NAME}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! 
+ rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 1000, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 5e-5, + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_min_lr": 1e-8, + "warmup_max_lr": 1e-4, + "total_num_steps": 60000, + "warmup_num_steps" : 1000 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+50000] +# +# --strategy deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 10000 \ + --val_check_interval 0.1 \ +" +prompt='"' +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" + +# --prompt $prompt \ +# --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M_summary/ckpt/hf_pretrained_epoch1_step75019 \ + +# mode_path="/cognitive_comp/dongxiaoqun/train_model/fengshen-pegasus-base/ckpt/hf_pretrained_epoch0_step22200/" +mode_path="/cognitive_comp/dongxiaoqun/train_model/fengshen-pegasus-large/ckpt/hf_pretrained_epoch0_step122000" +cp /cognitive_comp/dongxiaoqun/pretrained_model/pegasus-large/vocab.txt $mode_path/ + +MODEL_ARGS=" + --pretrained_model_path $mode_path \ + --output_save_path $output_save_path \ + --self_tokenizer \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229555 bash -c 'python3 $SCRIPT_PATH $CMD' + diff --git a/fengshen/examples/summary/randeng_t5_70M_summary.sh b/fengshen/examples/summary/randeng_t5_70M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..403d8d4dd022bf90fe9f50854291ec4e48f13aff --- /dev/null +++ b/fengshen/examples/summary/randeng_t5_70M_summary.sh @@ -0,0 +1,128 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" 
+MODEL_NAME=randeng_t5_77M_summary_test2 +MICRO_BATCH_SIZE=64 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/${MODEL_NAME}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 60000, + "warmup_num_steps" : 500 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+30000] +# export PL_FAULT_TOLERANT_TRAINING=1 + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M/ckpt/hf_pretrained_epoch0_step183100 \ + --output_save_path $ROOT_DIR/randeng_t5_77M_predict_lcsts.json \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD +# python $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229623 bash -c 'python3 $SCRIPT_PATH $CMD' diff --git a/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh b/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh new file mode 100644 index 0000000000000000000000000000000000000000..ccbf410fa92b1d5e09c97d6ae3af7bb4ff121c64 --- /dev/null +++ b/fengshen/examples/summary/randeng_t5_70M_summary_predict.sh @@ -0,0 +1,138 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary_predict +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: 
$(date)" +MODEL_NAME=randeng_t5_77M_summary_predict +MICRO_BATCH_SIZE=16 +ROOT_DIR=/cognitive_comp/ganruyi/experiments/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=$ROOT_DIR/randeng_t5_77M_predict_lcsts.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "betas": [ + 0.9, + 0.95 + ], + "eps": 1e-8, + "weight_decay": 5e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-4 + } + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions +export MASTER_PORT=$[RANDOM%10000+50000] + +# --strategy deepspeed_stage_${ZERO_STAGE} \ +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 2 \ + --num_nodes 1 \ + --strategy ddp \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ +" +DATA_DIR=/cognitive_comp/ganruyi/data_datasets_LCSTS_LCSTS/ +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +# --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_77M_summary/ckpt/hf_pretrained_epoch1_step75019 \ + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/pretrained_model/bart-759M \ + --output_save_path $ROOT_DIR/randeng_t5_77M_predict_lcsts.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --precision 16 \ + --warmup 0.01 \ + --do_eval_only \ + --max_dec_length 32 \ +" + +SCRIPTS_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD +source activate base +# srun singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' +python $CMD \ No newline at end of file diff --git a/fengshen/examples/summary/randeng_t5_784M_summary.sh b/fengshen/examples/summary/randeng_t5_784M_summary.sh new file mode 100644 index 0000000000000000000000000000000000000000..5b3e60c8784ac563eff09763591e00b6d250444f --- /dev/null +++ 
b/fengshen/examples/summary/randeng_t5_784M_summary.sh @@ -0,0 +1,130 @@ +#!/bin/bash +#SBATCH --job-name=randeng_t5_77M_summary +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=2 +#SBATCH --gres=gpu:2 # number of gpus +#SBATCH --cpus-per-task=30 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" +MODEL_NAME=randeng_t5_784M_summary +MICRO_BATCH_SIZE=8 +ROOT_DIR=/cognitive_comp/dongxiaoqun/finetune/${MODEL_NAME} +if [ ! -d ${ROOT_DIR} ];then + mkdir ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +ZERO_STAGE=1 + +config_json="${ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 100, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 50000000, + "allgather_bucket_size": 500000000 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "params": { + "warmup_max_lr": 1e-04, + "warmup_min_lr": 1e-05, + "total_num_steps": 60000, + "warmup_num_steps" : 500 + }, + "type": "WarmupDecayLR" + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/dongxiaoqun/torch_extendsions +# export MASTER_PORT=$[RANDOM%10000+30000] +# export PL_FAULT_TOLERANT_TRAINING=1 + +TRAINER_ARGS=" + --max_epochs 1 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor val_loss \ + --mode min \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.1 \ +" + +prompt="summary:" +DATA_ARGS=" + --datasets_name lcsts \ + --num_workers 30 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --max_enc_length 128 \ + --max_dec_length 64 \ + --val_datasets_field val \ + --prompt $prompt \ +" +# --prompt $prompt \ +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/ganruyi/experiments/randeng_t5_large_v2/ckpt/hf_pretrained_epoch0_step732500 \ + --output_save_path $ROOT_DIR/randeng_t5_784M_predict_lcsts.json \ +" + +SCRIPTS_PATH=/cognitive_comp/dongxiaoqun/debug/Fengshenbang-LM/fengshen/examples/summary/seq2seq_summary.py +SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " +echo $CMD + +source activate +conda activate torchnew +srun --nodes=1 --ntasks-per-node=1 --gres=gpu:1 --cpus-per-task=30 -o ${MODEL_NAME}-%J.log --jobid=229668 bash -c 'python3 $SCRIPT_PATH $CMD' +# source activate base +# python $CMD + +# srun --jobid=229668 --nodes=1 --gres=gpu:1 --ntasks-per-node=1 --cpus-per-task=30 -e ${ROOT_DIR}/${MODEL_NAME}-%j.err -o ${ROOT_DIR}/${MODEL_NAME}-%j.log singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' + +# srun python $CMD +# srun singularity exec --nv 
-B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c '/home/ganruyi/anaconda3/bin/python $CMD' diff --git a/fengshen/examples/summary/seq2seq_summary.py b/fengshen/examples/summary/seq2seq_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..c0c725c215d61dc5c6fa0fbf6603b7f06f0a317b --- /dev/null +++ b/fengshen/examples/summary/seq2seq_summary.py @@ -0,0 +1,197 @@ + +import torch +import os +import argparse +import json +import pytorch_lightning as pl +from fengshen.models.model_utils import add_module_args +from fengshen.data.task_dataloader.task_datasets import AbstractCollator +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +from fengshen.utils.utils import chinese_char_tokenize +from torchmetrics.text.rouge import ROUGEScore +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import LearningRateMonitor +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +import sys +sys.path.append('../../../') + + +# os.environ["CUDA_VISIBLE_DEVICES"] = '3,4' + + +class FinetuneSummary(pl.LightningModule): + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--rouge_keys', default='rougeL,rouge1,rouge2', type=str) + return parent_args + + def __init__(self, args, tokenizer=None): + super().__init__() + self.save_hyperparameters(args) + self.model = AutoModelForSeq2SeqLM.from_pretrained( + args.pretrained_model_path) + self.tokenizer = tokenizer + assert self.tokenizer, "tokenizer is None!" + self.rouge_keys = tuple(args.rouge_keys.split(',')) + self.rouge_metric = ROUGEScore(rouge_keys=self.rouge_keys, normalizer=lambda x: x) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * \ + float(self.trainer.max_epochs) + self.total_steps = ( + len(train_loader.dataset) // tb_size) // ab_size + print('total_steps is :', self.total_steps) + + def training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + def on_validation_start(self) -> None: + # rm file at validation start + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, self.trainer._accelerator_connector.cluster_environment.global_rank(), ext) + if os.path.exists(file_path_rank): + print('rm {}'.format(file_path_rank)) + os.remove(file_path_rank) + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + labels = self.tokenizer.batch_decode( + labels, skip_special_tokens=True, clean_up_tokenization_spaces=True) + # save preds for every rank + prefix, ext = 
os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, self.trainer._accelerator_connector.cluster_environment.global_rank(), ext) + self.save_prediction_to_file(preds=preds, texts=batch['text'], + summarys=batch['summary'], file_path=file_path_rank) + # you need to split chinese char with space for rouge metric + new_preds = [chinese_char_tokenize(p) for p in preds] + new_labels = [chinese_char_tokenize(label) for label in labels] + # update metric + self.rouge_metric.update(preds=new_preds, target=new_labels) + self.log('val_loss', output.loss, sync_dist=True) + + def validation_epoch_end(self, outputs): + # compute metric for all process + rouge_dict = self.rouge_metric.compute() + # reset the metric after once validation + self.rouge_metric.reset() + for k, v in rouge_dict.items(): + self.log('val_{}'.format(k), v, sync_dist=True) + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('rouge:\n', rouge_dict) + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + self.model.save_pretrained(os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'hf_pretrained_epoch{}_step{}'.format(checkpoint['epoch'], checkpoint['global_step']))) + + def save_prediction_to_file(self, preds, texts, summarys, file_path): + with open(file_path, 'a', encoding='utf-8') as f: + for idx, pred in enumerate(preds): + text = texts[idx] + summary = summarys[idx] + tmp_result = dict() + tmp_result['pred'] = pred + tmp_result['label'] = summary + tmp_result['text'] = text + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data + '\n') + + def predict_step(self, batch, batch_idx): + # print(batch) + texts = batch['text'] + # output summary and metrics + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + labels = self.tokenizer.batch_decode( + batch['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=True) + print(batch_idx, len(preds), len(labels)) + self.save_prediction_to_file(preds, texts, labels) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +def main(): + total_parser = argparse.ArgumentParser("Summary Task") + total_parser.add_argument('--do_eval_only', + action='store_true', + default=False) + total_parser.add_argument('--pretrained_model_path', + default='google/mt5-small', + type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', + type=str) + total_parser.add_argument('--self_tokenizer', + action='store_true', + default=False) + total_parser.add_argument('--max_enc_length', default=1024, type=int) + total_parser.add_argument('--max_dec_length', default=256, type=int) + total_parser.add_argument('--prompt', default='summarize:', type=str) + # * Args for data preprocessing + # from fengshen.data.task_dataloader.task_datasets import LCSTSDataModel + total_parser = UniversalDataModule.add_data_specific_args(total_parser) + # * Args for training + total_parser = add_module_args(total_parser) + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = UniversalCheckpoint.add_argparse_args(total_parser) + total_parser = 
FinetuneSummary.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + if args.self_tokenizer: + from fengshen.examples.pegasus.tokenizers_pegasus import PegasusTokenizer + tokenizer = PegasusTokenizer.from_pretrained(args.pretrained_model_path) + else: + tokenizer = AutoTokenizer.from_pretrained(args.pretrained_model_path, use_fast=False) + collator = AbstractCollator(tokenizer, args.max_enc_length, + args.max_dec_length, args.prompt) + data_model = UniversalDataModule(tokenizer=tokenizer, args=args, collate_fn=collator) + model = FinetuneSummary(args, tokenizer) + if not args.do_eval_only: + lr_monitor = LearningRateMonitor(logging_interval='step') + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/')) + checkpoint_callback = UniversalCheckpoint(args) + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[lr_monitor, + checkpoint_callback] + ) + trainer.fit(model, data_model) + else: + trainer = Trainer.from_argparse_args(args) + # trainer.predict(model, data_model) + trainer.validate(model, data_model) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/tcbert/README.md b/fengshen/examples/tcbert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6f6b38e2b9cc6978962927bb0e8568b46da28f0 --- /dev/null +++ b/fengshen/examples/tcbert/README.md @@ -0,0 +1,145 @@ +[**中文**](./README.md) + +# TCBert +论文 《[TCBERT: A Technical Report for Chinese Topic Classification BERT](https://arxiv.org/abs/2211.11304)》源码 + +## Requirements + +安装 fengshen 框架 + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start + +你可以参考我们的 [example.py](./example.py) 脚本,只需要将处理好的 ```train_data```、```dev_data```、```test_data```、 ```prompt```、```prompt_label``` ,输入模型即可。 +```python +import argparse +from fengshen.pipelines.tcbert import TCBertPipelines +from pytorch_lightning import seed_everything + +total_parser = argparse.ArgumentParser("Topic Classification") +total_parser = TCBertPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese' +args.learning_rate = 2e-5 +args.max_length = 512 +args.max_epochs = 3 +args.batchsize = 1 +args.train = 'train' +args.default_root_dir = './' +# args.gpus = 1 #注意:目前使用CPU进行训练,取消注释会使用GPU,但需要配置相应GPU环境版本 +args.fixed_lablen = 2 #注意:可以设置固定标签长度,由于样本对应的标签长度可能不一致,建议选择合适的数值表示标签长度 + +train_data = [ + {"content": "凌云研发的国产两轮电动车怎么样,有什么惊喜?", "label": "科技",} + ] + +dev_data = [ + {"content": "我四千一个月,老婆一千五一个月,存款八万且有两小孩,是先买房还是先买车?","label": "汽车",} +] + +test_data = [ + {"content": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?"} +] + +prompt = "下面是一则关于{}的新闻:" + +prompt_label = {"汽车":"汽车", "科技":"科技"} + +model = TCBertPipelines(args, model_path=pretrained_model_path, nlabels=len(prompt_label)) + +if args.train: + model.train(train_data, dev_data, prompt, prompt_label) +result = model.predict(test_data, prompt, prompt_label) +``` + + +## Pretrained Model +为了提高模型在话题分类上的效果,我们收集了大量话题分类数据进行基于`prompt`的预训练。我们已经将预训练模型开源到 ```HuggingFace``` 社区当中。 + +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-TCBert-110M-Classification-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese) | +| Erlangshen-TCBert-330M-Classification-Chinese | 
[https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Classification-Chinese) | +| Erlangshen-TCBert-1.3B-Classification-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Classification-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Classification-Chinese) | +| Erlangshen-TCBert-110M-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-110M-Sentence-Embedding-Chinese) | +| Erlangshen-TCBert-330M-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-330M-Sentence-Embedding-Chinese) | +| Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-TCBert-1.3B-Sentence-Embedding-Chinese) | + +## Experiments + +对每个不同的数据集,选择合适的模板```Prompt``` +Dataset | Prompt +|------------|------------| +| TNEWS | 下面是一则关于{}的新闻: | +| CSLDCP | 这一句描述{}的内容如下: | +| IFLYTEK | 这一句描述{}的内容如下: | + +使用上述```Prompt```的实验结果如下: +| Model | TNEWS | CLSDCP | IFLYTEK | +|------------|------------|----------|-----------| +| Macbert-base | 55.02 | 57.37 | 51.34 | +| Macbert-large | 55.77 | 58.99 | 50.31 | +| Erlangshen-1.3B | 57.36 | 62.35 | 53.23 | +| TCBert-base-110M-Classification-Chinese | 55.57 | 58.60 | 49.63 | +| TCBert-large-330M-Classification-Chinese | 56.17 | 61.23 | 51.34 | +| TCBert-1.3B-Classification-Chinese | 57.41 | 65.10 | 53.75 | +| TCBert-base-110M-Sentence-Embedding-Chinese | 54.68 | 59.78 | 49.40 | +| TCBert-large-330M-Sentence-Embedding-Chinese | 55.32 | 62.07 | 51.11 | +| TCBert-1.3B-Sentence-Embedding-Chinese | 57.46 | 65.04 | 53.06 | + +## Dataset + +需要您提供:```训练集```、```验证集```、```测试集```、```Prompt```、```标签映射```五个数据,对应的数据格式如下: + +#### 训练数据 示例 +必须包含```content```和```label```字段 +```json +[{ + "content": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "label": "汽车" +}] +``` + +#### 验证数据 示例 +必须包含```content```和```label```字段 +```json +[{ + "content": "宁夏邀深圳市民共赴“寻找穿越”之旅", + "label": "旅游" +}] +``` + +#### 测试数据 示例 +必须包含```content```字段 +```json +[{ + "content": "买涡轮增压还是自然吸气车?今天终于有答案了!" 
+}] +``` +#### Prompt 示例 +可以选择任一模版,模版的选择会对模型效果产生影响,其中必须包含```{}```,作为标签占位符 +```json +"下面是一则关于{}的新闻:" +``` + +#### 标签映射 示例 +可以将真实标签映射为更合适Prompt的标签,支持映射后的标签长度不一致 +```json +{ + "汽车": "汽车", + "旅游": "旅游", + "经济生活": "经济生活", + "房产新闻": "房产" +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/tcbert/__init__.py b/fengshen/examples/tcbert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fengshen/examples/tcbert/example.py b/fengshen/examples/tcbert/example.py new file mode 100644 index 0000000000000000000000000000000000000000..5eff218461c65f40ec88e9ea2c7e0cdbe1d05082 --- /dev/null +++ b/fengshen/examples/tcbert/example.py @@ -0,0 +1,86 @@ +import argparse +from fengshen.pipelines.tcbert import TCBertPipelines +from pytorch_lightning import seed_everything + +def main(): + seed_everything(123) + total_parser = argparse.ArgumentParser("Topic Classification") + total_parser = TCBertPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + pretrained_model_path = 'IDEA-CCNL/Erlangshen-TCBert-110M-Classification-Chinese' + args.learning_rate = 2e-5 + args.max_length = 512 + args.max_epochs = 5 + args.batchsize = 4 + args.train = 'train' + args.default_root_dir = './' + # args.gpus = 1 #注意:目前使用CPU进行训练,取消注释会使用GPU,但需要配置相应GPU环境版本 + args.fixed_lablen = 2 #注意:可以设置固定标签长度,由于样本对应的标签长度可能不一致,建议选择适中的数值表示标签长度 + + train_data = [ # 训练数据 + {"content": "真正的放养教育,放的是孩子的思维,养的是孩子的习惯", "label": "故事"}, + {"content": "《唐人街探案》捧红了王宝强跟刘昊然,唯独戏份不少的他发展最差", "label": "娱乐"}, + {"content": "油价攀升 阿曼经济加速增长", "label": "财经"}, + {"content": "日本男篮近期动作频频,中国队的未来劲敌会是他们吗?", "label": "体育"}, + {"content": "教育部:坚决防止因撤并乡村小规模学校导致学生上学困难", "label": "教育"}, + {"content": "LOL设计最完美的三个英雄,玩家们都很认可!", "label": "电竞"}, + {"content": "上联:浅看红楼终是梦,怎么对下联?", "label": "文化"}, + {"content": "楼市再出新政!北京部分限房价项目或转为共有产权房", "label": "房产"}, + {"content": "企业怎样选云服务器?云服务器哪家比较好?", "label": "科技"}, + {"content": "贝纳利的三缸车TRE899K、TRE1130K华丽转身", "label": "汽车"}, + {"content": "如何评价:刘姝威的《严惩做空中国股市者》?", "label": "股票"}, + {"content": "宁夏邀深圳市民共赴“寻找穿越”之旅", "label": "旅游"}, + {"content": "日本自民党又一派系力挺安倍 称会竭尽全力", "label": "国际"}, + {"content": "农村养老保险每年交5000,交满15年退休后能每月领多少钱?", "label": "农业"}, + {"content": "国产舰载机首次现身,进度超过预期,将率先在滑跃航母测试", "label": "军事"} + ] + + dev_data = [ # 验证数据 + {"content": "西游记后传中,灵儿最爱的女人是谁?不是碧游!", "label": "故事"}, + {"content": "小李子莱奥纳多有特别的提袋子技能,这些年他还有过哪些神奇的造型?", "label": "娱乐"}, + {"content": "现在手上有钱是投资买房还是存钱,为什么?", "label": "财经"}, + {"content": "迪卡侬的衣服值得购买吗?", "label": "体育"}, + {"content": "黑龙江省旅游委在齐齐哈尔组织举办导游培训班", "label": "教育"}, + {"content": "《王者荣耀》中,哪些英雄的大招最“废柴”?", "label": "电竞"}, + {"content": "上交演绎马勒《复活》,用音乐带来抚慰和希望", "label": "文化"}, + {"content": "All in服务业,58集团在租房、住房市场的全力以赋", "label": "房产"}, + {"content": "为什么有的人宁愿选择骁龙660的X21,也不买骁龙845的小米MIX2S?", "label": "科技"}, + {"content": "众泰大型SUV来袭,售13.98万,2.0T榨出231马力,汉兰达要危险了", "label": "汽车"}, + {"content": "股票放量下趺,大资金出逃谁在接盘?", "label": "股票"}, + {"content": "广西博白最大的特色是什么?", "label": "旅游"}, + {"content": "特朗普退出《伊朗核协议》,对此你怎么看?", "label": "国际"}, + {"content": "卖水果利润怎么样?", "label": "农业"}, + {"content": "特种兵都是身材高大的猛男么?别再被电视骗了,超过1米8都不合格", "label": "军事"} + ] + + test_data = [ # 测试数据 + {"content": "廖凡重出“江湖”再争影帝 亮相戛纳红毯霸气有型"}, + {"content": "《绝地求生: 刺激战场》越玩越卡?竟是手机厂商没交“保护费”!"}, + {"content": "买涡轮增压还是自然吸气车?今天终于有答案了!"}, + ] + + #标签映射 将真实标签可以映射为更合适prompt的标签 + prompt_label = { + "体育":"体育", "军事":"军事", "农业":"农业", "国际":"国际", + "娱乐":"娱乐", "房产":"房产", "故事":"故事", 
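+        # (说明性注释)键为数据中的真实标签,值为填入 prompt 的标签文本;此处保持一一对应,也可按需映射为更贴合 prompt 的表述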
"教育":"教育", + "文化":"文化", "旅游":"旅游", "汽车":"汽车", "电竞":"电竞", + "科技":"科技", "股票":"股票", "财经":"财经" + } + + #不同的prompt会影响模型效果 + #prompt = "这一句描述{}的内容如下:" + prompt = "下面是一则关于{}的新闻:" + + model = TCBertPipelines(args, model_path=pretrained_model_path, nlabels=len(prompt_label)) + + if args.train: + model.train(train_data, dev_data, prompt, prompt_label) + result = model.predict(test_data, prompt, prompt_label) + + for i, line in enumerate(result): + print({"content":test_data[i]["content"], "label":list(prompt_label.keys())[line]}) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/translate/README.md b/fengshen/examples/translate/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fec3d5c0142aa08375927aba65bbd7c60187ff06 --- /dev/null +++ b/fengshen/examples/translate/README.md @@ -0,0 +1,65 @@ +# translation examples +## 数据预处理 + +数据预处理部分目前整合不多,主要提供一个最终的转换文件,转换成模型应用格式,前期还是应用mose等工具进行数据的预处理部分,产出处理后的源目标语言和目标语言两份数据,再调用本脚本合并 + +前期数据预处理脚本可参考,deltalm在fairseq中的demo,prepare_iwslt14.sh:https://github.com/microsoft/unilm/blob/master/deltalm/examples/prepare_iwslt14.sh) + +### 目标格式 +需要将翻译的源语言和目标语言转换到一个文件中,格式如下: +src为源语言,tgt为目标语言,每一行都是一个json格式 +``` +{"src": "und was menschliche gesundheit ist , kann auch ziemlich kompliziert sein .", "tgt": "and it can be a very complicated thing , what human health is ."} +{"src": "nun , warum spielt das eine rolle für die menschliche gesundheit ?", "tgt": "now why does that matter for human health ?"} +{"src": "das ist ein bild der cannery row von 1932 .", "tgt": "this is a shot of cannery row in 1932 ."} +``` +### 处理脚本 + +目前的finetue数据主要是通过deltalm的提供的实现,通过脚本转换成封神数据格式 + +当前的转换脚本只是简单的将源语言和目标语言合并到一个文件,并生成上述格式,后续会继续完善处理脚本 + +脚本路径:Fengshenbang-LM/fengshen/examples/translate/prepare_dataset.py + + +使用方式: +``` +python prepare_dataset.py processed_data_path de-en +``` + +## deltalm 模型 + +### deltalm模型路径 +1) https://huggingface.co./IDEA-CCNL/Randeng-Deltalm-362M-En-Zn
+2) https://huggingface.co./IDEA-CCNL/Randeng-Deltalm-362M-Zh-En + +主要包含三个文件: +config.json:模型配置文件 +pytorch_model.bin:模型文件 +spm.model:sentence_piece文件 + +### deltalm 模型结构 +均实现在 Fengshenbang-LM/fengshen/models/deltalm 路径下,文件结构如下: +1) modeling_deltalm.py 实现模型的基本结构,结构如论文所示 +2) tokenizer_deltalm.py 实现模型的tokenzier部分 +3) configuration_deltalm.py 实现模型的config配置部分 + +### finetune 德译英示例 +主要实现代码在 Fengshenbang-LM/fengshen/examples/translate/finetune_deltalm.py +通过脚本调用即可, 参考脚本 Fengshenbang-LM/fengshen/examples/translate/finetune_deltalm.sh + +使用示例: +``` +bash -x finetune_deltalm.sh +``` + +注:如果要使用label_smoothing,当前需要设置label_smoothing参数不为0,当前默认值为0.1。直接在finetune_deltalm.sh里修改参数值即可 + +## 运行环境 + +pyhton = 3.8.10 +pytorch = 1.10.0 +transformers = 4.20.1 +pytorch-lightning = 1.6.5 + +相关环境安装可参考Wiki:http://wiki.team.idea.edu.cn/pages/viewpage.action?pageId=16291924 diff --git a/fengshen/examples/translate/finetune_deltalm.py b/fengshen/examples/translate/finetune_deltalm.py new file mode 100644 index 0000000000000000000000000000000000000000..d19dd1ca4a5f920dcb90863e89940f05362e2cda --- /dev/null +++ b/fengshen/examples/translate/finetune_deltalm.py @@ -0,0 +1,449 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +import pandas as pd +import json +import argparse +import torch +import os +import logging +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from pytorch_lightning.utilities import rank_zero_info +from sacrebleu.metrics import BLEU +from fengshen.utils.utils import chinese_char_tokenize +from fengshen.models.model_utils import add_module_args, add_inverse_square_args +from fengshen.models.deltalm.tokenizer_deltalm import DeltalmTokenizer +from fengshen.models.deltalm.modeling_deltalm import DeltalmForConditionalGeneration +from fengshen.utils import UniversalCheckpoint +from fengshen.data.universal_datamodule import UniversalDataModule +from pytorch_lightning import Trainer, loggers, LightningModule +from pytorch_lightning.callbacks import LearningRateMonitor +from mosestokenizer import MosesDetokenizer +from typing import List +import sys +sys.path.append('../../../') + +# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast +# from pytorch_lightning.callbacks.early_stopping import EarlyStopping + + +mose_decode = MosesDetokenizer() + +os.environ["CUDA_VISIBLE_DEVICES"] = '4' +logger = logging.getLogger(__name__) + +EVAL_BLEU_ORDER = 4 + + +def calc_bleu_from_stats(sentence_stats: pd.DataFrame) -> BLEU: + corpus_stats = sentence_stats.sum(axis=0) + smooth = {"smooth_method": "exp"} + corpus_bleu = BLEU.compute_bleu( + correct=[ + corpus_stats.correct_1_grams, + corpus_stats.correct_2_grams, + corpus_stats.correct_3_grams, + corpus_stats.correct_4_grams, + ], + total=[ + corpus_stats.total_1_grams, + corpus_stats.total_2_grams, + corpus_stats.total_3_grams, + corpus_stats.total_4_grams, + ], + sys_len=corpus_stats.translation_length, + ref_len=corpus_stats.reference_length, + **smooth + ) + return corpus_bleu + + +def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): + if target.dim() == lprobs.dim() - 1: + target = target.unsqueeze(-1) + # logger.debug("Debug: After target.dim() == lprobs.dim(): ", target.dim(), lprobs.dim()) + nll_loss = -lprobs.gather(dim=-1, index=target) + smooth_loss = -lprobs.sum(dim=-1, keepdim=True) + if ignore_index is not None: + pad_mask = target.eq(ignore_index) + nll_loss.masked_fill_(pad_mask, 0.0) + smooth_loss.masked_fill_(pad_mask, 0.0) + else: + nll_loss = nll_loss.squeeze(-1) + smooth_loss = 
smooth_loss.squeeze(-1) + if reduce: + nll_loss = nll_loss.sum() + smooth_loss = smooth_loss.sum() + eps_i = epsilon / (lprobs.size(-1) - 1) + valid_length = target.ne(ignore_index).sum() + # unvalid_length = target.eq(ignore_index).sum() + loss = ((1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss) / valid_length.item() + + return loss, nll_loss + + +class DataCollator: + def __init__(self, model, tokenizer, max_enc_length, max_dec_length, reverse_src_tgt): + self.tokenizer = tokenizer + self.max_enc_length = max_enc_length + self.max_dec_length = max_dec_length + self.model = model + self.reverse_src_tgt = reverse_src_tgt + + def __call__(self, batch_samples): + batch_inputs, batch_targets = [], [] + for sample in batch_samples: + if self.reverse_src_tgt: + if "tgt" in sample and len(sample["tgt"]) != 0: + batch_inputs.append(sample["tgt"]) + batch_targets.append(sample["src"]) + else: + if "src" in sample and len(sample["src"]) != 0: + batch_inputs.append(sample["src"]) + batch_targets.append(sample["tgt"]) + batch_data = self.tokenizer( + batch_inputs, + padding='max_length', + max_length=self.max_enc_length, + truncation=True, + return_tensors="pt" + ) + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer( + batch_targets, + padding='max_length', + max_length=self.max_dec_length, + truncation=False, + return_tensors="pt" + )["input_ids"] + batch_data['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels) + batch_data['labels'] = labels + + batch_data['src'] = batch_inputs + batch_data['tgt'] = batch_targets + + # logger.debug(batch_data) + return batch_data + + +class FinetuneTranslation(LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('deltalm-base finetune') + parser.add_argument('--label_smoothing', default=0.1, type=float) + return parent_args + + def __init__(self, args, tokenizer=None): + super().__init__() + self.args = args + self.save_hyperparameters(args) + if args.other_model: + self.model = AutoModelForSeq2SeqLM.from_pretrained(args.model_path) + else: + self.model = DeltalmForConditionalGeneration.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.tokenizer = tokenizer + assert self.tokenizer, "tokenizer is None!" 
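+        # BLEU metric from sacrebleu: per-batch n-gram sufficient statistics are collected
+        # in validation_step via get_sufficient_stats() and aggregated into a corpus-level
+        # score by calc_bleu_from_stats() in validation_epoch_end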
+ self.blue_metric = BLEU() + self.sufficient_stats: List[List[int]] = [] + self.label_smoothing = self.args.label_smoothing + self.mose_decode = MosesDetokenizer() + + if self.args.label_smoothing != 0: + self.loss_fn = label_smoothed_nll_loss + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float( + self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // + tb_size) // ab_size + + def configure_optimizers(self): + # if self.args.use_default_configure: + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + def training_step(self, batch, batch_idx): + if self.label_smoothing == 0: + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=batch['labels']) + + self.log('train_loss', output.loss, sync_dist=True) + return output.loss + + # TODO label_smoothing should be implemented at here + else: + labels = batch["labels"] + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + decoder_input_ids=batch['decoder_input_ids']) + + logits = output["logits"] + m = torch.nn.LogSoftmax(dim=-1) + lprobs = m(logits.float()) + loss, _ = self.loss_fn(lprobs.view(-1, lprobs.size(-1)), labels.view(-1), + self.label_smoothing, self.tokenizer.pad_token_id) + self.log('train_loss', loss, sync_dist=True) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1, )) + + y_true = labels.view(size=(-1, )) + pad_mask = y_true.eq(1) + valid_length = y_true.ne(1).sum() + + corr = torch.eq(y_pred, y_true.float()) + corr.masked_fill_(pad_mask, 0.0) + acc = torch.sum(corr.float()) / valid_length + return acc + + def get_sufficient_stats(self, translations: List[str], references: List[str]) -> pd.DataFrame: + assert len(translations) == len(references), ( + f"There are {len(translations)} translated sentences " + f"but {len(references)} reference sentences" + ) + + # for sentence, ref in zip(translations, references): + + sentence_bleu = self.blue_metric.corpus_score(translations, [references]) + self.sufficient_stats.append( + [ + # Number of correct 1-grams, .., 4-grams + sentence_bleu.counts[0], + sentence_bleu.counts[1], + sentence_bleu.counts[2], + sentence_bleu.counts[3], + # Total number of 1-grams, .., 4-grams + sentence_bleu.totals[0], + sentence_bleu.totals[1], + sentence_bleu.totals[2], + sentence_bleu.totals[3], + # Length of translated sentence. + sentence_bleu.sys_len, + # Length of reference sentence. + sentence_bleu.ref_len, + ] + ) + + def on_validation_start(self) -> None: + # rm file at validation start + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, + self.trainer._accelerator_connector.cluster_environment. 
+ global_rank(), ext) + if os.path.exists(file_path_rank): + # logger.debug('rm {}'.format(file_path_rank)) + os.remove(file_path_rank) + + def validation_step(self, batch, batch_idx): + + def postprocess_text(preds, labels, tgt_zh): + if tgt_zh: + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + else: + preds = list(map(lambda x: mose_decode(x.strip().split()), preds)) + labels = list(map(lambda x: mose_decode(x.strip().split()), labels)) + return preds, labels + + tmp_label = batch['labels'] + end_token_index = torch.where(tmp_label == self.tokenizer.eos_token_id)[1] + for idx, end_idx in enumerate(end_token_index): + tmp_label[idx][end_idx+1:] = -100 + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + labels=tmp_label) + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length) + + preds = self.tokenizer.batch_decode(generated_ids, + skip_special_tokens=True) + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + + labels = self.tokenizer.batch_decode(labels, + skip_special_tokens=True) + + decoded_preds, decoded_labels = postprocess_text(preds, labels, self.args.tgt_zh) + # save preds for every rank + prefix, ext = os.path.splitext(self.hparams.output_save_path) + file_path_rank = '{}_{}{}'.format( + prefix, + self.trainer._accelerator_connector.cluster_environment. + global_rank(), ext) + self.save_prediction_to_file(preds=decoded_preds, + sources=batch['src'], + targets=decoded_labels, + ori_target=batch['tgt'], + file_path=file_path_rank) + + if self.args.tgt_zh: + new_preds = [chinese_char_tokenize(p) for p in decoded_preds] + new_labels = [chinese_char_tokenize(label) for label in decoded_labels] + self.get_sufficient_stats(new_preds, new_labels) + else: + self.get_sufficient_stats(decoded_preds, decoded_labels) + # batch_bleu = self.blue_metric.corpus_score(decoded_preds, [decoded_labels]).score + acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss, sync_dist=True) + self.log('val_acc', acc, sync_dist=True) + + def validation_epoch_end(self, outputs): + rank_zero_info("***** Validation results *****") + sentence_states = pd.DataFrame( + self.sufficient_stats, + columns=[ + "correct_1_grams", + "correct_2_grams", + "correct_3_grams", + "correct_4_grams", + "total_1_grams", + "total_2_grams", + "total_3_grams", + "total_4_grams", + "translation_length", + "reference_length", + ] + ) + + computed_bleu = calc_bleu_from_stats(sentence_states) + rank_zero_info("valid_sacrebleu= {}\n".format(computed_bleu.score)) + self.log('valid_sacrebleu', computed_bleu.score, sync_dist=True) + self.sufficient_stats = [] + + def on_save_checkpoint(self, checkpoint) -> None: + if self.trainer._accelerator_connector.cluster_environment.global_rank( + ) == 0: + self.model.save_pretrained( + os.path.join( + self.trainer.checkpoint_callback.dirpath, + 'finetuned_epoch{}_step{}'.format( + checkpoint['epoch'], checkpoint['global_step']))) + + def save_prediction_to_file(self, preds, sources, targets, ori_target, file_path): + with open(file_path, 'a', encoding='utf-8') as f: + for idx, pred in enumerate(preds): + source = sources[idx] + target = targets[idx] + tmp_result = dict() + tmp_result['pred'] = pred + tmp_result['source'] = source + tmp_result['label'] = target + tmp_result['ori_label'] = ori_target[idx] + json_data = json.dumps(tmp_result, 
ensure_ascii=False) + f.write(json_data + '\n') + + def test_step(self, batch, batch_idx): + # print(batch) + texts = batch['src'] + # output summary and metrics + self.model.eval() + generated_ids = self.model.generate( + input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=self.hparams.max_dec_length + ) + preds = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + labels = torch.where(batch['labels'] != -100, batch['labels'], + self.tokenizer.pad_token_id) + labels = self.tokenizer.batch_decode( + labels, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + self.save_prediction_to_file(preds, texts, labels, self.hparams.output_save_path) + + +def configure_logger(logging_lever=logging.INFO): + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging_lever) + + +def main(): + args_parser = argparse.ArgumentParser("Pegasus Task") + args_parser.add_argument('--do_eval_only', + action='store_true', + default=False) + args_parser.add_argument('--other_model', + action='store_true', + default=False) + args_parser.add_argument('--reverse_src_tgt', + action='store_true', + default=False) + args_parser.add_argument('--tgt_zh', + action='store_true', + default=False) + args_parser.add_argument('--early_stopping_callback', + action='store_true', + default=False) + args_parser.add_argument('--pretrained_model_path', + default='facebook/mbart', + type=str) + args_parser.add_argument('--output_save_path', + default='predict.json', + type=str) + args_parser.add_argument('--max_enc_length', default=512, type=int) + args_parser.add_argument('--max_dec_length', default=512, type=int) + + # * Args for data preprocessing + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + + # * Args for training + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args_parser = FinetuneTranslation.add_model_specific_args(args_parser) + args_parser = add_module_args(args_parser) + args_parser = add_inverse_square_args(args_parser) + + args = args_parser.parse_args() + + if args.other_model: + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + else: + tokenizer = DeltalmTokenizer.from_pretrained(args.model_path) + # tokenizer = AutoTokenizer.from_pretrained(args.model_path) + print("tokenizer vocab size: ", tokenizer.vocab_size) + model = FinetuneTranslation(args, tokenizer) + collator = DataCollator(model.model, tokenizer, args.max_enc_length, args.max_dec_length, args.reverse_src_tgt) + data_model = UniversalDataModule(tokenizer=tokenizer, + args=args, + # datasets=dataset, + collate_fn=collator) + + lr_monitor = LearningRateMonitor(logging_interval='step') + + configure_logger(logging_lever=logging.INFO) + + if not args.do_eval_only: + + lr_monitor = LearningRateMonitor(logging_interval='step') + tensorboard_logger = loggers.TensorBoardLogger( + save_dir=os.path.join(args.default_root_dir, 'logs/'), + name=os.path.basename(os.path.dirname(args.model_path))) + checkpoint_callback = UniversalCheckpoint(args) + # early_stop = EarlyStopping(monitor=args.monitor, mode=args.mode) + trainer = Trainer.from_argparse_args( + args, logger=tensorboard_logger, callbacks=[lr_monitor, checkpoint_callback]) + trainer.fit(model, data_model) + + else: + trainer = Trainer.from_argparse_args(args) + 
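+        # evaluation-only mode: run validation to report val_loss/val_acc and valid_sacrebleu;
+        # decoded predictions are appended to --output_save_path (one file per rank)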
trainer.validate(model, data_model) + # trainer.test(model, data_model) + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/translate/finetune_deltalm.sh b/fengshen/examples/translate/finetune_deltalm.sh new file mode 100644 index 0000000000000000000000000000000000000000..6d6bd9ef5fde6c9afd2957b79118e13b4e94d8da --- /dev/null +++ b/fengshen/examples/translate/finetune_deltalm.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +#SBATCH --job-name=mbart_en_zh +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH --cpus-per-task=32 +#SBATCH -o %x-%j.log + +set -x -e + +echo "START TIME: $(date)" + +MODEL_NAME=deltalm_en_zh +MICRO_BATCH_SIZE=16 +ROOT_DIR=../../workspace +MODEL_ROOT_DIR=$ROOT_DIR/${MODEL_NAME} + + +if [ ! -d ${MODEL_ROOT_DIR} ];then + mkdir ${MODEL_ROOT_DIR} + echo ${MODEL_ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${MODEL_ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +output_save_path=${MODEL_ROOT_DIR}.json +if [ -f ${output_save_path} ];then + echo ${output_save_path} exist, rm it!!!!!!!!!!!!!!!!! + rm ${output_save_path} +fi + +ZERO_STAGE=1 + +config_json="${MODEL_ROOT_DIR}/ds_config.${MODEL_NAME}.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": ${MICRO_BATCH_SIZE}, + "steps_per_print": 1000, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": false + }, + "zero_allow_untested_optimizer": false, + "fp16": { + "enabled": true + }, + "wall_clock_breakdown": false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + + +TRAINER_ARGS=" + --max_epochs 20 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_${ZERO_STAGE} \ + --default_root_dir ${MODEL_ROOT_DIR} \ + --save_ckpt_path ${MODEL_ROOT_DIR}/ckpt \ + --save_top_k 3 \ + --monitor valid_sacrebleu \ + --mode max \ + --save_last \ + --every_n_train_steps 0 \ + --val_check_interval 0.2 \ + --label_smoothing 0.1 \ + --warmup_steps 4000 \ + --learning_rate 1e-7 \ + --adam_beta2 0.98 \ + --scheduler_type inverse_sqrt \ + --reverse_src_tgt \ + --tgt_zh \ +" + +DATA_ARGS=" + --datasets_name case_test \ + --num_workers 8 \ + --train_batchsize $MICRO_BATCH_SIZE \ + --val_batchsize $MICRO_BATCH_SIZE \ + --test_batchsize $MICRO_BATCH_SIZE \ + --val_datasets_field val \ + --max_enc_length 256 \ + --max_dec_length 256 \ +" + +mode_path="IDEA-CCNL/Randeng-Deltalm-362M-En-Zn" + + +MODEL_ARGS=" + --model_path $mode_path \ + --output_save_path $output_save_path \ +" + +SCRIPTS_PATH=finetune_deltalm.py + +cat $SCRIPTS_PATH + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +source activate +conda activate fengshen +# srun python3 $CMD +python3 $CMD diff --git a/fengshen/examples/translate/prepare_dataset.py b/fengshen/examples/translate/prepare_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5ce8cc74e05ab477a5863b99470c30c4073876c8 --- /dev/null +++ b/fengshen/examples/translate/prepare_dataset.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import json +import os + + +def main(file_path, src_lang, tgt_lang): + + file_list = ["train", "valid", "test"] + for filename in file_list: + sys.stderr.write("**** Start processing {} ... 
****\n".format(filename)) + src_full_path = os.path.join(file_path, ".".join((filename, src_lang))) + tgt_full_path = os.path.join(file_path, ".".join((filename, tgt_lang))) + src_reader = open(src_full_path, 'r') + tgt_reader = open(tgt_full_path, "r") + + writer_full_path = os.path.join(file_path, ".".join((filename, src_lang + "_" + tgt_lang))) + writer = open(writer_full_path, "w") + # combine_dict = OrderedDict() + for row_src, row_tgt in zip(src_reader, tgt_reader): + combine_line = {} + combine_line["src"] = row_src.strip() + combine_line["tgt"] = row_tgt.strip() + json.dump(combine_line, writer, ensure_ascii=False) + writer.write('\n') + # print(row_src) + # print(row_tgt) + sys.stderr.write(f"**** Done change {filename} format **** \n") + + +if __name__ == "__main__": + file_path = sys.argv[1] + src_lang, tgt_lang = sys.argv[2].split("-") + + main(file_path, src_lang, tgt_lang) diff --git a/fengshen/examples/ubert/README.md b/fengshen/examples/ubert/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fdad2ca0d948830c51bf141dceb907c4531a4690 --- /dev/null +++ b/fengshen/examples/ubert/README.md @@ -0,0 +1,280 @@ +# Ubert: 统一 NLU 任务新范式 +- 论文:[https://arxiv.org/pdf/2206.12094.pdf](https://arxiv.org/pdf/2206.12094.pdf) +- 知乎:[https://zhuanlan.zhihu.com/p/539958182?](https://zhuanlan.zhihu.com/p/539958182?) + +### 简介 +Ubert 是我们在做 [2022AIWIN 世界人工智能创新大赛:中文保险小样本多任务](http://ailab.aiwin.org.cn/competitions/68#results) 时提出的一种解决方案。并取得A/B榜榜首的成绩,且B榜综合成绩领先第二名超过 1 个百分点,领先第三名接近 5 个百分点。相比于官方提供的 baseline,提高 20 个百分点。Ubert 不仅可以完成 实体识别、事件抽取等常见抽取任务,还可以完成新闻分类、自然语言推理等分类任务,且所有任务是共享一个统一框架、统一任务、统一训练目标的模型。解题思路和方案可以参考我们的答辩PPT,或者参考我们的[知乎文章](https://zhuanlan.zhihu.com/p/539958182?) + +## 开源模型列表 + 开源的模型是我们在比赛模型的基础上重新整理 70+ 份数据,共 100万+条样本,进行预训练而得到的,可直接开箱即用。开源模型地址如下: +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-Ubert-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-110M-Chinese) | +| Erlangshen-Ubert-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-Ubert-330M-Chinese) | + + +## 快速开箱使用 +安装我们的 fengshen 框架,我们暂且提供如下方式安装 +```python +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable ./ +``` + +一键运行下面代码得到预测结果, 你可以任意修改示例 text 和要抽取的 entity_type,体验一下 Zero-Shot 性能 +```python +import argparse +from fengshen import UbertPiplines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UbertPiplines.piplines_args(total_parser) +args = total_parser.parse_args() + +test_data=[ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "这也让很多业主据此认为,雅清苑是政府公务员挤对了国家的经适房政策。", + "choices": [ + {"entity_type": "小区名字"}, + {"entity_type": "岗位职责"} + ], + "id": 0} +] + +model = UbertPiplines(args) +result = model.predict(test_data) +for line in result: + print(line) +``` + +## 继续 finetune 使用 + +开源的模型我们已经经过大量的数据进行预训练而得到,可以直接进行 Zero-Shot,如果你还想继续finetune,可以参考我们的 [example.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/fengshen/examples/ubert/example.py)。你只需要将我们数据预处理成为我们定义的格式,即可使用简单的几行代码完成模型的训练和推理。我们是复用 pytorch-lightning 的 trainer 。在训练时,可以直接传入 trainer 的参数,此外我们还定义了一些其他参数。常用的参数如下: + + +```sh +--pretrained_model_path #预训练模型的路径,默认 +--load_checkpoints_path #加载模型的路径,如果你finetune完,想加载模型进行预测可以传入这个参数 +--batchsize #批次大小, 默认 8 +--monitor #保存模型需要监控的变量,例如我们可监控 val_span_acc +--checkpoint_path #模型保存的路径, 默认 ./checkpoint +--save_top_k #最多保存几个模型, 默认 3 +--every_n_train_steps #多少步保存一次模型, 
默认 100 +--learning_rate #学习率, 默认 2e-5 +--warmup #预热的概率, 默认 0.01 +--default_root_dir #模型日子默认输出路径 +--gradient_clip_val #梯度截断, 默认 0.25 +--gpus #gpu 的数量 +--check_val_every_n_epoch #多少次验证一次, 默认 100 +--max_epochs #多少个 epochs, 默认 5 +--max_length #句子最大长度, 默认 512 +--num_labels #训练每条样本最多取多少个label,超过则进行随机采样负样本, 默认 10 +``` + +## 数据预处理示例 + +整个模型的 Piplines 我们已经写好,所以为了方便,我们定义了数据格式。目前我们在预训练中主要含有一下几种任务类型 + +| task_type | subtask_type | +|:---------:|:--------------:| +| 分类任务 | 文本分类 | +| | 自然语言推理 | +| | 情感分析 | +| | 多项式阅读理解 | +| 抽取任务 | 实体识别 | +| | 事件抽取 | +| | 抽取式阅读理解 | +| | 关系抽取 | + +### 分类任务 + +#### 普通分类任务 +对于分类任务,我们把类别描述当作是 entity_type,我们主要关注 label 字段,label为 1 表示该该标签是正确的标签。如下面示例所示 +```json +{ + "task_type": "分类任务", + "subtask_type": "文本分类", + "text": "7000亿美元救市方案将成期市毒药", + "choices": [{ + "entity_type": "一则股票新闻", + "label": 1, + "entity_list": [] + }, { + "entity_type": "一则教育新闻", + "label": 0, + "entity_list": [] + }, { + "entity_type": "一则科学新闻", + "label": 0, + "entity_list": [] + }], + "id": 0 +} + +``` + +#### 自然语言推理 +```json +{ + "task_type": "分类任务", + "subtask_type": "自然语言推理", + "text": "在白云的蓝天下,一个孩子伸手摸着停在草地上的一架飞机的螺旋桨。", + "choices": [{ + "entity_type": "可以推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 1, + "entity_list": [] + }, { + "entity_type": "不能推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 0, + "entity_list": [] + }, { + "entity_type": "很难推断出:一个孩子正伸手摸飞机的螺旋桨。", + "label": 0, + "entity_list": [] + }], + "id": 0 +} +``` + + +#### 语义匹配 + +```json +{ + "task_type": "分类任务", + "subtask_type": "语义匹配", + "text": "不要借了我是试试看能否操作的", + "choices": [{ + "entity_type": "不能理解为:借款审核期间能否取消借款", + "label": 1, + "entity_list": [] + }, { + "entity_type": "可以理解为:借款审核期间能否取消借款", + "label": 0, + "entity_list": [] + }], + "id": 0 +} + +``` + +### 抽取任务 +对于抽取任务,label 字段是无效的 +#### 实体识别 +```json +{ + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "彭小军认为,国内银行现在走的是台湾的发卡模式,先通过跑马圈地再在圈的地里面选择客户,", + "choices": [{ + "entity_type": "地址", + "label": 0, + "entity_list": [{ + "entity_name": "台湾", + "entity_type": "地址", + "entity_idx": [ + [15, 16] + ] + }] + }{ + "entity_type": "政府机构", + "label": 0, + "entity_list": [] + }, { + "entity_type": "电影名称", + "label": 0, + "entity_list": [] + }, { + "entity_type": "人物姓名", + "label": 0, + "entity_list": [{ + "entity_name": "彭小军", + "entity_type": "人物姓名", + "entity_idx": [ + [0, 2] + ] + }] + }, + "id": 0 +} + +``` +#### 事件抽取 +```json + +{ + "task_type": "抽取任务", + "subtask_type": "事件抽取", + "text": "小米9价格首降,6GB+128GB跌了200,却不如红米新机值得买", + "choices": [{ + "entity_type": "降价的时间", + "label": 0, + "entity_list": [] + }, { + "entity_type": "降价的降价方", + "label": 0, + "entity_list": [] + }, { + "entity_type": "降价的降价物", + "label": 0, + "entity_list": [{ + "entity_name": "小米9", + "entity_type": "降价的降价物", + "entity_idx": [ + [0, 2] + ] + }, { + "entity_name": "小米9", + "entity_type": "降价的降价物", + "entity_idx": [ + [0, 2] + ] + }] + }, { + "entity_type": "降价的降价幅度", + "label": 0, + "entity_list": [] + }], + "id": 0 +} +``` +#### 抽取式阅读理解 + +```json +{ + "task_type": "抽取任务", + "subtask_type": "抽取式阅读理解", + "text": "截至2014年7月1日,圣地亚哥人口估计为1381069人,是美国第八大城市,加利福尼亚州第二大城市。它是圣迭戈-蒂华纳城市群的一部分,是美国与底特律-温莎之后的第二大跨境城市群,人口4922723。圣地亚哥是加州的出生地,以全年温和的气候、天然的深水港、广阔的海滩、与美国海军的长期联系以及最近作为医疗和生物技术发展中心而闻名。", + "choices": [{ + "entity_type": "除了医疗保健,圣迭戈哪个就业部门已经强势崛起?", + "label": 0, + "entity_list": [{ + "entity_name": "生物技术发展", + "entity_idx": [ + [153, 158] + ] + }] + }, { + "entity_type": "在所有的军事部门中,哪一个在圣地亚哥的存在最为强大?", + "label": 0, + "entity_list": [{ + "entity_name": "美国海军", + "entity_idx": [ + [135, 138] + ] + }] + }, { + "entity_type": "在美国十大城市中,圣迭戈排名哪一位?", 
+ "label": 0, + "entity_list": [{ + "entity_name": "第八", + "entity_idx": [ + [33, 34] + ] + }] + }], + "id": 0 +} +``` + diff --git a/fengshen/examples/ubert/example.py b/fengshen/examples/ubert/example.py new file mode 100644 index 0000000000000000000000000000000000000000..bedd365ff67ff5d9b1f8f22777dab9b5a8b02394 --- /dev/null +++ b/fengshen/examples/ubert/example.py @@ -0,0 +1,95 @@ +import argparse +from fengshen import UbertPipelines +import os +os.environ["CUDA_VISIBLE_DEVICES"] = '6' + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser = UbertPipelines.pipelines_args(total_parser) + args = total_parser.parse_args() + + # 设置一些训练要使用到的参数 + args.pretrained_model_path = 'IDEA-CCNL/Erlangshen-Ubert-110M-Chinese' #预训练模型的路径,我们提供的预训练模型存放在HuggingFace上 + args.default_root_dir = './' #默认主路径,用来放日志、tensorboard等 + args.max_epochs = 5 + args.gpus = 1 + args.batch_size = 1 + + # 只需要将数据处理成为下面数据的 json 样式就可以一键训练和预测,下面只是提供了一条示例样本 + train_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "彭小军认为,国内银行现在走的是台湾的发卡模式,先通过跑马圈地再在圈的地里面选择客户,", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": [ + {"entity_name": "台湾", "entity_type": "地址", "entity_idx": [[15, 16]]}]}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": []}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": [ + {"entity_name": "彭小军", "entity_type": "人物姓名", "entity_idx": [[0, 2]]}]}, + {"entity_type": "组织机构", "label": 0, "entity_list": []}, + {"entity_type": "岗位职位", "label": 0, "entity_list": []}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + "id": 0} + ] + dev_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "就天涯网推出彩票服务频道是否是业内人士所谓的打政策“擦边球”,记者近日对此事求证彩票监管部门。", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": []}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": [ + {"entity_name": "天涯网", "entity_type": "公司", "entity_idx": [[1, 3]]}]}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": []}, + {"entity_type": "组织机构", "label": 0, "entity_list": [ + {"entity_name": "彩票监管部门", "entity_type": "组织机构", "entity_idx": [[40, 45]]}]}, + {"entity_type": "岗位职位", "label": 0, "entity_list": [ + {"entity_name": "记者", "entity_type": "岗位职位", "entity_idx": [[31, 32]]}]}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + + "id": 0} + + ] + test_data = [ + { + "task_type": "抽取任务", + "subtask_type": "实体识别", + "text": "这也让很多业主据此认为,雅清苑是政府公务员挤对了国家的经适房政策。", + "choices": [ + {"entity_type": "地址", "label": 0, "entity_list": [ + {"entity_name": "雅清苑", "entity_type": "地址", "entity_idx": [[12, 14]]}]}, + {"entity_type": "书名", "label": 0, "entity_list": []}, + {"entity_type": "公司", "label": 0, "entity_list": []}, + {"entity_type": "游戏", "label": 0, "entity_list": []}, + {"entity_type": "政府机构", "label": 0, "entity_list": []}, + {"entity_type": "电影名称", "label": 0, "entity_list": []}, + {"entity_type": "人物姓名", "label": 0, "entity_list": []}, + {"entity_type": "组织机构", "label": 0, "entity_list": []}, + {"entity_type": "岗位职位", "label": 0, "entity_list": [ + {"entity_name": "公务员", "entity_type": 
"岗位职位", "entity_idx": [[18, 20]]}]}, + {"entity_type": "旅游景点", "label": 0, "entity_list": []} + ], + "id": 0}, + ] + + model = UbertPipelines(args) + model.fit(train_data, dev_data) + result = model.predict(test_data) + for line in result: + print(line) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/unimc/README.md b/fengshen/examples/unimc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16abf3ff69c5ab7b8b8ca1f7c7ec191cbdf64ec0 --- /dev/null +++ b/fengshen/examples/unimc/README.md @@ -0,0 +1,221 @@ +[**中文**](./README.md) | [**English**](./README_en.md) +# UniMC + +EMNLP 2022 论文 《[Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective](https://arxiv.org/abs/2210.08590)》源码 + +![](./unimc.jpg) + +## Update +- [2022-10-18] Release preprint in arXiv. +- [2022-10-14] Release code in GitHub. + +## Requirements + +安装 fengshen 框架 + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start + +你可以参考我们的 [example.py](./example.py) 脚本,只需要将处理好的 train、dev、test 即输入模型即可。 +```python +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UniMCPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese' +args.learning_rate=2e-5 +args.max_length=512 +args.max_epochs=3 +args.batchsize=8 +args.default_root_dir='./' +model = UniMCPipelines(args,model_path=pretrained_model_path) + +train_data = [] +dev_data = [] +test_data = [{ + "texta": "就是废物,充电不进害得老子把主板烧了,客服不耐烦", + "textb": "", + "question": "", + "choice": ["这是一条差评", "这是一条好评"], + "answer": "这是一条差评", + "label": 0, + "id": 31 +}] + +if args.train: + model.train(train_data, dev_data) +result = model.predict(test_data) +``` +## Pretrained Model +对于英文模型,我们使用14份 multiplechoice 数据集进行了预训练。在中文模型中,我们已经收集了48份数据集对模型进行预训练,我们已经将预训练模型开源到 HuggingFace 社区当中。 + +| 模型 | 地址 | +|:---------:|:--------------:| +| Erlangshen-UniMC-Albert-235M-English | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English) | +| Erlangshen-UniMC-RoBERTa-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | +| Erlangshen-UniMC-RoBERTa-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | +| Erlangshen-UniMC-MegatronBERT-1.3B-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | + +## Experiments + + +### English + +为了测评 UniMC 的性能,在英文中,我们使用 14份 multiple-choice 数据集(具体数据参考原论文)来对模型进行预训练,使其具备做选择题的能力, + +**Zero-shot** +| Model | T0 11B | GLaM 60B | FLAN 137B | PaLM 540B | UniMC 235M | +|---------|--------|----------|-----------|-----------|------------| +| ANLI R1 | 43.6 | 40.9 | 47.7 | 48.4 | **52.0** | +| ANLI R2 | 38.7 | 38.2 | 43.9 | 44.2 | **44.4** | +| ANLI R3 | 41.3 | 40.9 | 47.0 | 45.7 | **47.8** | +| CB | 70.1 | 33.9 | 64.1 | 51.8 | **75.7** | +### Chinese + +为了测评 UniMC 在中文场景下的性能我们使用 13份 有监督数据集来对模型进行预训练,预训练数据如下: +| Task type | Task | # of option | Data size | +|---------|--------|----------|-----------| +| Multiple-choice | c3 | 4 
| 11.8k | +| Multiple-choice | ClozeT | 2 | 0.7k | +| Multiple-choice | CMRC2019 | n | 11.4k | +| Multiple-choice | GCRC | 4 | 7.8k | +| Classification | DuEE-Fin | 12 | 4.3k | +| Classification | DuEE1.0 | 65 | 10.3k | +| Classification | Fudan | 20 | 19.6k | +| Classification | THUNEWS | 10 | 180k | +| NLI | CMNLI | 3 | 39k | +| NLI | SNLI | 3 | 545.8k | +| Paraphrace | AFQMC | 2 | 34.3k | +| Paraphrace | PAWS-X | 2 | 49k | +| Paraphrace | STS-B | 2 | 80k | + +我们使用中文领域常用的benchmark来测试UniMC的性能,具体是FewCLUE的9个任务,我们在 test_public 上测评模型的性能。 + + +**Few-shot** +| Model | eprstmt | csldcp | tnews | iflytek | ocnli | bustm | chid | csl | wsc | Avg | +|------------|------------|----------|-----------|----------|-----------|-----------|-----------|----------|-----------|-----------| +| Finetuning | 65.4 | 35.5 | 49 | 32.8 | 33 | 60.7 | 14.9 | 50 | 55.6 | 44.1 | +| PET | 86.7 | 51.7 | 54.5 | 46 | 44 | 56 | 61.2 | 59.4 | 57.5 | 57.44 | +| LM-BFF | 85.6 | 54.4 | 53 | 47.1 | 41.6 | 57.6 | 61.2 | 51.7 | 54.7 | 56.32 | +| P-tuning | 88.3 | 56 | 54.2 | **57.6** | 41.9 | 60.9 | 59.3 | **62.9** | 58.1 | 59.91 | +| EFL | 84.9 | 45 | 52.1 | 42.7 | 66.2 | 71.8 | 30.9 | 56.6 | 53 | 55.91 | +| [UniMC-RoBERTa-110M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | 88.64 | 54.08 | 54.32 | 48.6 | 66.55 | 73.76 | 67.71 | 52.54 | 59.92 | 62.86 | +| [UniMC-RoBERTa-330M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | 89.53 | 57.3 | 54.25 | 50 | 70.59 | 77.49 | 78.09 | 55.73 | 65.16 | 66.46 | +| [UniMC-MegatronBERT-1.3B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | **89.278** | **60.9** | **57.46** | 52.89 | **76.33** | **80.37** | **90.33** | 61.73 | **79.15** | **72.05** | + +**Zero-shot** + +| Model | eprstmt | csldcp | tnews | iflytek | ocnli | bustm | chid | csl | wsc | Avg | +|---------------|-----------|-----------|-----------|-----------|-----------|----------|----------|----------|-----------|-----------| +| GPT-zero | 57.5 | 26.2 | 37 | 19 | 34.4 | 50 | 65.6 | 50.1 | 50.3 | 43.4 | +| PET-zero | 85.2 | 12.6 | 26.1 | 26.6 | 40.3 | 50.6 | 57.6 | 52.2 | 54.7 | 45.1 | +| NSP-BERT | 86.9 | 47.6 | 51 | 41.6 | 37.4 | 63.4 | 52 | **64.4** | 59.4 | 55.96 | +| ZeroPrompt | - | - | - | 16.14 | 46.16 | - | - | - | 47.98 | - | +| Yuan1.0-13B | 88.13 | 38.99 | 57.47 | 38.82 | 48.13 | 59.38 | 86.14 | 50 | 38.99 | 56.22 | +| ERNIE3.0-240B | 88.75 | **50.97** | **57.83** | **40.42** | 53.57 | 64.38 | 87.13 | 56.25 | 53.46 | 61.41 | +| [UniMC-RoBERTa-110M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | 86.16 | 31.26 | 46.61 | 26.54 | 66.91 | 73.34 | 66.68 | 50.09 | 53.66 | 55.7 | +| [UniMC-RoBERTa-330M](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | 87.5 | 30.4 | 47.6 | 31.5 | 69.9 | 75.9 | 78.17 | 49.5 | 60.55 | 59.01 | +| [UniMC-MegatronBERT-1.3B](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | **88.79** | 42.06 | 55.21 | 33.93 | **75.57** | **79.5** | **89.4** | 50.25 | **66.67** | **64.53** | + + + +## Dataset + +我们已经定义好了 UniMC 所需的数据格式,你只需要将数据转化为下面的数据格式即可: + +### 文本分类 +```json +{ + "texta": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "房产", + "汽车", + "教育", + "军事" + ], + "answer": "汽车", + "label": 1, + "id": 7759 +} + +``` + +### 情感分析 +```json +{ + "texta": "就是废物,充电不进害得老子把主板烧了,客服不耐烦", + "textb": "", + "question": "", + "choice": ["这是一条差评", "这是一条好评"], + "answer": "这是一条差评", + "label": 0, + "id": 31 +} + 
+``` + +### 语义匹配 +```json +{ + "texta": "不要借了我是试试看能否操作的", + "textb": "", + "question": "", + "choice": ["不能理解为:借款审核期间能否取消借款", "可以理解为:借款审核期间能否取消借款"], + "answer": "不能理解为:借款审核期间能否取消借款", + "label": 0, + "id": 0 +} + +``` + +### 自然语言推理 +```json +{ + "texta": "身上裹一件工厂发的棉大衣,手插在袖筒里", + "textb": "", + "question": "", + "choice": ["不能推断出:身上至少一件衣服", "很难推断出:身上至少一件衣服", "可以推断出:身上至少一件衣服"], + "answer": "可以推断出:身上至少一件衣服", + "label": 2, + "id": 0 +} + +``` + + +## Citation +如果你觉得本仓库帮助到了你,你可以使用下面方式引用我们的工作 + +```text +@article{unimc, + author = {Ping Yang and + Junjie Wang and + Ruyi Gan and + Xinyu Zhu and + Lin Zhang and + Ziwei Wu and + Xinyu Gao and + Jiaxing Zhang and + Tetsuya Sakai}, + title = {Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective}, + journal = {CoRR}, + volume = {abs/2210.08590}, + year = {2022} +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/unimc/README_en.md b/fengshen/examples/unimc/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..0a1e86888c5cfc3046527613f603f96729cdab08 --- /dev/null +++ b/fengshen/examples/unimc/README_en.md @@ -0,0 +1,104 @@ +[**中文**](./README.md) | [**English**](./README_en.md) +# UniMC +Code for [Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective](https://arxiv.org/abs/2210.08590) + + + +![](./unimc.jpg) + +## Update +- [2022-10-18] Release preprint in arXiv. +- [2022-10-14] Release code in GitHub. + +## Requirements + + +```shell +git clone https://github.com/IDEA-CCNL/Fengshenbang-LM.git +cd Fengshenbang-LM +pip install --editable . +``` + +## Quick Start +You can refer to our [example.py]() + +```python +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + +total_parser = argparse.ArgumentParser("TASK NAME") +total_parser = UniMCPipelines.piplines_args(total_parser) +args = total_parser.parse_args() + +pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English' +args.language='english' +args.learning_rate=2e-5 +args.max_length=512 +args.max_epochs=3 +args.batchsize=8 +args.default_root_dir='./' +model = UniMCPipelines(args, model_path=pretrained_model_path) + +train_data = [] +dev_data = [] +test_data = [{ + "texta": "it 's just incredibly dull .", + "textb": "", + "question": "What is sentiment of follow review?", + "choice": ["it's great", "it's terrible"], + "answer": "", + "label": 0, + "id": 19 +}] + +if args.train: + model.train(train_data, dev_data) +result = model.predict(test_data) +``` +## Pretrained Model +For the English model, the model was pre-trained with 14 multiplechoice datasets. For the Chinese model, we have collected 48 datasets to pre-train the model, and we have open sourced the pre-trained model to the HuggingFace community. 
+ +| Model | URL | +|:---------:|:--------------:| +| Erlangshen-UniMC-Albert-235-English | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-Albert-235M-English) | +| Erlangshen-UniMC-RoBERTa-110M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese) | +| Erlangshen-UniMC-RoBERTa-330M-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UnimC-RoBERTa-330M-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-RoBERTa-330M-Chinese) | +| Erlangshen-UniMC-MegatronBERT-1.3B-Chinese | [https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese](https://huggingface.co./IDEA-CCNL/Erlangshen-UniMC-MegatronBERT-1.3B-Chinese) | + + +## Experiments +To evaluate the performance of UniMC, we use 14 multiple-choice datasets to pre-train the model with the ability to make choices + +**Zero-shot** +| Model | T0 11B | GLaM 60B | FLAN 137B | PaLM 540B | UniMC 235M | +|---------|--------|----------|-----------|-----------|------------| +| ANLI R1 | 43.6 | 40.9 | 47.7 | 48.4 | **52.0** | +| ANLI R2 | 38.7 | 38.2 | 43.9 | 44.2 | **44.4** | +| ANLI R3 | 41.3 | 40.9 | 47.0 | 45.7 | **47.8** | +| CB | 70.1 | 33.9 | 64.1 | 51.8 | **75.7** | + +## Citation +If this repository helps you, please cite this paper: + +```text +@article{unimc, + author = {Ping Yang and + Junjie Wang and + Ruyi Gan and + Xinyu Zhu and + Lin Zhang and + Ziwei Wu and + Xinyu Gao and + Jiaxing Zhang and + Tetsuya Sakai}, + title = {Zero-Shot Learners for Natural Language Understanding via a Unified Multiple Choice Perspective}, + journal = {CoRR}, + volume = {abs/2210.08590}, + year = {2022} +} +``` + +## License + +[Apache License 2.0](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/main/LICENSE) + diff --git a/fengshen/examples/unimc/example.py b/fengshen/examples/unimc/example.py new file mode 100644 index 0000000000000000000000000000000000000000..8f6a257ad2438fe2158c6a66cb69b0ce9704e90b --- /dev/null +++ b/fengshen/examples/unimc/example.py @@ -0,0 +1,82 @@ +import argparse +from fengshen.pipelines.multiplechoice import UniMCPipelines + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser = UniMCPipelines.piplines_args(total_parser) + args = total_parser.parse_args() + + pretrained_model_path = 'IDEA-CCNL/Erlangshen-UniMC-RoBERTa-110M-Chinese' + args.learning_rate = 2e-5 + args.max_length = 512 + args.max_epochs = 3 + args.batchsize = 8 + args.train = 'train' + args.default_root_dir = './' + + model = UniMCPipelines(args, model_path=pretrained_model_path) + + train_data = [ # 训练数据 + { + "texta": "凌云研发的国产两轮电动车怎么样,有什么惊喜?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "教育", + "科技", + "军事", + "旅游", + "国际", + "股票", + "农业", + "电竞" + ], + "answer": "科技", + "label": 1, + "id": 0 + } + ] + dev_data = [ # 验证数据 + { + "texta": "我四千一个月,老婆一千五一个月,存款八万且有两小孩,是先买房还是先买车?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "故事", + "文化", + "娱乐", + "体育", + "财经", + "房产", + "汽车" + ], + "answer": "汽车", + "label": 6, + "id": 0 + } + ] + test_data = [ # 测试数据 + {"texta": "街头偶遇2018款长安CS35,颜值美炸!或售6万起,还买宝骏510?", + "textb": "", + "question": "下面新闻属于哪一个类别?", + "choice": [ + "房产", + "汽车", + "教育", + "军事" + ], + "answer": "汽车", + "label": 1, + "id": 7759} + ] + + if args.train: + model.train(train_data, dev_data) + result = model.predict(test_data) + for line in result: + print(line) + + +if __name__ == 
"__main__": + main() diff --git a/fengshen/examples/unimc/unimc.jpg b/fengshen/examples/unimc/unimc.jpg new file mode 100644 index 0000000000000000000000000000000000000000..53715d9ac87d78b8d6cbcf65f7c8190a6e0fae05 Binary files /dev/null and b/fengshen/examples/unimc/unimc.jpg differ diff --git a/fengshen/examples/wenzhong_qa/README.md b/fengshen/examples/wenzhong_qa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b424909f39c5b1480fbc5cc7015e82714292930 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/README.md @@ -0,0 +1,75 @@ +#
yuyuanQA模型finetune +本示例主要实现了基于GPT2结构的Yuyuan医疗大模型,通过医疗问答对Finetune,使大模型能够有closebook-qa的能力。 +### 数据和模型 +#### 模型: +finetune的模型是yuyuan模型,余元模型是GPT2的结构,在预训练阶段主要是用PubMed医疗相关的数据集进行的预训练。是一个医疗领域的大模型。模型共有35亿参数,主要参数如下表所示: + +| 配置 | 参数 | +| :---------: | :---: | +| nlayers | 30 | +| nheaders | 32 | +| hidden-size | 3072 | +| seq-length | 1024 | + +预训练的数据,主要医疗相关的论文、杂志期刊等,以英文语料为主。 +#### 数据: +用于finetune的语料是清洗于[MedQuAD](https://github.com/abachaa/MedQuAD)数据集,清洗完成后是下面的格式: +```text +...... +{'question':'.........','answer':'........'} +{'question':'.........','answer':'........'} +...... +``` +### finetune框架以及参数配置 +#### 框架 : +finetune的框架是IDEA研究院CCNL小组整合各大框架的优点开源的[封神框架](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen),具体代码可以参考[finetune_medicalQA.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/examples/wenzhong_qa/finetune_medicalQA.py)和[medicalQADataset.py](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/data/task_dataloader/medicalQADataset.py)。 +#### 训练参数: +训练参数,我们采用了deepspeed相关的配置,用2个集群的节点共16张A100,在很短的时间内完成了finetune。具体参数配置可以参考[finetune_GPT2_medicalQA.sh](https://github.com/IDEA-CCNL/Fengshenbang-LM/blob/dev_wzw/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh) +### finetune后的效果以及使用 +#### 效果对比: +finetune后的模型,用100对问答对,基于BLEU分与之前用Magetron框架训练的模型进行了简单的对比,效果比较接近。 + +unsmoth method: +| 框架 | 1-gram | 2-gram | 3-gram | 4-gram | +| -------- | ------------------ | ------------------ | ------------------ | ------------------- | +| Fengshen | 0.5241376169070796 | 0.5215762466122144 | 0.4894353584800885 | 0.44840139357073466 | +| Magetron | 0.5321340489166898 | 0.5110257474778213 | 0.4703745962926368 | 0.4310875933354554 | + +smoth method: +| 框架 | 1-gram | 2-gram | 3-gram | 4-gram | +| -------- | ----------------- | ------------------ | ------------------ | ------------------ | +| Fengshen | 0.717829796617609 | 0.6516910802858905 | 0.5859726677095979 | 0.525510691686505 | +| Magetron | 0.776190980974117 | 0.6749801211321476 | 0.5897846253142169 | 0.5230773076722481 | +#### 使用方式: +支持直接用Haggingface或者pytorch-lightning框架调用。由于在finetune的时候,加入了prompt,在问答的时候,输入应该是:" +`Question:your question about medical? answer:`",接着模型就回以续写的方式回答你的问题。用huggingface的调用代码可以参考下面的代码: +```python +from transformers import GPT2Tokenizer,GPT2LMHeadModel +model_path = 'pretrained_model_hf/yuyuanQA-v1' # input your own model file path +model = GPT2LMHeadModel.from_pretrained(model_path) +tokenizer = GPT2Tokenizer.from_pretrained(model_path) +model = model.cuda(6) # move your model to the GPU +model.eval() # just do predict + +def answering(question): +# question = "What should gout patients pay attention to in diet?" 
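+    # the model was finetuned with a prompt, so the input must follow the
+    # "Question:{question} answer:" template; the model then answers by continuing the text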
+ inputs = tokenizer(f'Question:{question} answer:',return_tensors='pt').input_ids.to(model.device) + + generation_output = model.generate(input_ids = inputs, + return_dict_in_generate=True, + output_scores=True, + max_length=150, + # max_new_tokens=80, + do_sample=True, + top_p = 0.9, + eos_token_id=50256, + pad_token_id=0, + num_return_sequences = 5) + answers = [] + for idx,sentence in enumerate(generation_output.sequences): + next_sentence = tokenizer.decode(sentence).split('<|endoftext|>')[0] + answer = next_sentence.split(sep='answer:',maxsplit=1)[1] + answers.append(answer) + return answers +answering('your question?') +``` \ No newline at end of file diff --git a/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh b/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh new file mode 100644 index 0000000000000000000000000000000000000000..d9a81670ed121ecfb9fa3e0e546f0773374087af --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_GPT2_medicalQA.sh @@ -0,0 +1,123 @@ +#!/bin/bash +#SBATCH --job-name=medical_qa_finetune +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=8 +#SBATCH --gres=gpu:8 # number of gpus +#SBATCH -o /cognitive_comp/wuziwei/task/fs_medical_qa_finetune/%x-%j.log +#SBATCH -e /cognitive_comp/wuziwei/task/fs_medical_qa_finetune/%x-%j.err +#SBATCH -x dgx[050,049] + +#export NCCL_DEBUG=INFO + +# export PATH=$PATH:/cognitive_comp/wuziwei/codes/fengshen/fengshen +set -x -e + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/wuziwei/task/fs_medical_qa_finetune + +ZERO_STAGE=2 + +config_json="$ROOT_DIR/training_config.json" +export MASTER_PORT=$[RANDOM%10000+30000] + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "zero_optimization": { + "stage": $ZERO_STAGE, + "contiguous_gradients": true, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "allgather_bucket_size": 2e8 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5, + "betas": [0.9,0.95], + "eps": 1e-8, + "weight_decay": 1e-2 + } + }, + "scheduler": { + "type": "WarmupLR", + "params":{ + "warmup_min_lr": 5e-6, + "warmup_max_lr": 1e-5 + } + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 32, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": false, + "contiguous_memory_optimization": false + }, + "wall_clock_breakdown": false, + "zero_allow_untested_optimizer": false, + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 100, + "gradient_clipping": 1.0 +} +EOT + +# export PL_DEEPSPEED_CONFIG_PATH=$config_json +export PL_DEEPSPEED_CONFIG_PATH=$config_json +export TORCH_EXTENSIONS_DIR=/cognitive_comp/wuziwei/torch_extendsions +TRAINER_ARGS=" + --max_epochs 10 \ + --gpus 16 \ + --num_nodes 2 \ + --strategy deepspeed_stage_2 \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/wuziwei/task-data/medical_qa +DATA_ARGS=" + --data_dir $DATA_DIR \ + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.txt \ + --valid_data valid.txt \ + --test_data test.txt +" + +# PRETRAINED_MODEL_PATH=/cognitive_comp/wuziwei/pretrained_model_hf/gpt2 +PRETRAINED_MODEL_PATH=/cognitive_comp/wuziwei/pretrained_model_hf/medical_v2 +MODEL_ARGS=" + --pretrained_model_path ${PRETRAINED_MODEL_PATH} \ + --output_save_path 
$ROOT_DIR/predict.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/wuziwei/codes/fengshen/fengshen/examples/GPT_pretrain_finetune/finetune_medicalQA.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/wuziwei/container/oneflow-cuda11.sif +# singularity exec --nv -B /cognitive_comp/wuziwei/:/cognitive_comp/wuziwei/ $SINGULARITY_PATH python $CMD + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" + +srun singularity exec --nv -B /cognitive_comp/wuziwei/:/cognitive_comp/wuziwei/ $SINGULARITY_PATH bash -c 'python $CMD' diff --git a/fengshen/examples/wenzhong_qa/finetune_medicalQA.py b/fengshen/examples/wenzhong_qa/finetune_medicalQA.py new file mode 100644 index 0000000000000000000000000000000000000000..1a79948d5f7fe736856e44392a834edfa6ac51d9 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_medicalQA.py @@ -0,0 +1,176 @@ +from transformers import GPT2LMHeadModel +from data.task_dataloader.medicalQADataset import GPT2QADataModel +from transformers.optimization import get_linear_schedule_with_warmup +from pytorch_lightning import Trainer, loggers +from pytorch_lightning.callbacks import ModelCheckpoint +import pytorch_lightning as pl +import argparse +import torch +import os +import sys +sys.path.insert(0, '/cognitive_comp/wuziwei/codes/fengshen/fengshen') +# sys.path.append('../../') +# sys.path.append('../') +# os.environ["CUDA_VISIBLE_DEVICES"] = '4,5,6,7' + + +class GPT2FinetuneMedicalQAModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=1000, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + # every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class GPT2FinetuneMedicalQA(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = GPT2LMHeadModel.from_pretrained( + args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data / + (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def 
training_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def main(): + total_parser = argparse.ArgumentParser("Summary Task") + total_parser.add_argument( + '--do_eval_only', action='store_true', default=False) + total_parser.add_argument( + '--pretrained_model_path', default=None, type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = GPT2QADataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQAModelCheckpoint.add_argparse_args( + total_parser) + total_parser = GPT2FinetuneMedicalQA.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = GPT2QADataModel(args) + if not args.do_eval_only: + model = GPT2FinetuneMedicalQA(args, len(data_model.train_dataloader())) + checkpoint_callback = GPT2FinetuneMedicalQAModelCheckpoint( + args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='MedicalQA-GPT2') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + + # result = trainer.predict(model, data_model) + # with open('test_results.txt', 'wt', encoding='utf-8') as w: + # for line in result: + # w.writelines(line) + + model.model.save_pretrained( + '/cognitive_comp/wuziwei/pretrained_model_hf') + else: + print('save to hf.....') + trainer = Trainer.from_argparse_args(args) + model = GPT2FinetuneMedicalQA( + args, len(data_model.predict_dataloader())) + + result = trainer.predict( + model, data_model, ckpt_path='/cognitive_comp/wuziwei/task/fs_medical_qa_finetune/ckpt/last.ckpt') + # with open('test_results.txt','wt',encoding='utf-8') as w: + # for line in result: + # 
w.writelines(line) + + model.model.save_pretrained( + '/cognitive_comp/wuziwei/pretrained_model_hf') + + +if __name__ == '__main__': + main() diff --git a/fengshen/examples/wenzhong_qa/finetune_wenzhong.py b/fengshen/examples/wenzhong_qa/finetune_wenzhong.py new file mode 100644 index 0000000000000000000000000000000000000000..bcdeda71fd2d2d70dd56148451ddf2d4946bf31c --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_wenzhong.py @@ -0,0 +1,153 @@ +# sys.path.append('./') +import os +import torch +import argparse +import pytorch_lightning as pl +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import Trainer, loggers +from transformers.optimization import get_linear_schedule_with_warmup +from transformers import GPT2LMHeadModel +from fengshen.data.task_dataloader.medicalQADataset import GPT2QADataModel + + +class GPT2FinetuneMedicalQAModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./ckpt/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + parser.add_argument('--save_last', action='store_true', default=True) + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename, + save_last=args.save_last) + + +class GPT2FinetuneMedicalQA(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--learning_rate', default=1e-4, type=float) + parser.add_argument('--weight_decay', default=0.1, type=float) + parser.add_argument('--warmup', default=0.01, type=float) + return parent_args + + def __init__(self, args, num_data): + super().__init__() + self.args = args + self.num_data = num_data + print('num_data:', num_data) + self.model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path) + + def setup(self, stage) -> None: + if stage == 'fit': + num_gpus = self.trainer.gpus if self.trainer.gpus is not None else 0 + self.total_step = int(self.trainer.max_epochs * self.num_data + / (max(1, num_gpus) * self.trainer.accumulate_grad_batches)) + print('Total training step:', self.total_step) + + def training_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('train_loss', output.loss) + return output.loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float()) / labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + output = self.model( + input_ids=batch['input_ids'], 
attention_mask=batch['attention_mask'], labels=batch['labels']) + # output = self.model(input_ids=batch['input_ids'], labels=batch['labels']) + # acc = self.comput_metrix(output.logits, batch['labels']) + self.log('val_loss', output.loss) + # self.log('val_acc', acc) + + def configure_optimizers(self): + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + paras = list( + filter(lambda p: p[1].requires_grad, self.named_parameters())) + paras = [{ + 'params': + [p for n, p in paras if not any(nd in n for nd in no_decay)], + 'weight_decay': self.args.weight_decay + }, { + 'params': [p for n, p in paras if any(nd in n for nd in no_decay)], + 'weight_decay': 0.0 + }] + optimizer = torch.optim.AdamW(paras, lr=self.args.learning_rate) + scheduler = get_linear_schedule_with_warmup( + optimizer, int(self.total_step * self.args.warmup), + self.total_step) + + return [{ + 'optimizer': optimizer, + 'lr_scheduler': { + 'scheduler': scheduler, + 'interval': 'step', + 'frequency': 1 + } + }] + + +def main(): + total_parser = argparse.ArgumentParser("QA Task") + total_parser.add_argument('--do_eval_only', action='store_true', default=False) + total_parser.add_argument('--pretrained_model_path', default='google/mt5-small', type=str) + total_parser.add_argument('--output_save_path', default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = GPT2QADataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = Trainer.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQAModelCheckpoint.add_argparse_args(total_parser) + total_parser = GPT2FinetuneMedicalQA.add_model_specific_args(total_parser) + # * Args for base model + args = total_parser.parse_args() + + data_model = GPT2QADataModel(args) + if not args.do_eval_only: + model = GPT2FinetuneMedicalQA(args, len(data_model.train_dataloader())) + checkpoint_callback = GPT2FinetuneMedicalQAModelCheckpoint(args).callbacks + logger = loggers.TensorBoardLogger(save_dir=os.path.join( + args.default_root_dir, 'log/'), name='WenZhong') + trainer = Trainer.from_argparse_args(args, + logger=logger, + callbacks=[checkpoint_callback] + ) + trainer.fit(model, data_model) + + +if __name__ == '__main__': + main() + # test() + +''' +# python examples/mt5_summary.py --gpus=1 --test_data=test_public.jsonl +# --default_root_dir=/cognitive_comp/ganruyi/fengshen/mt5_summary/eval +# --do_eval_only +# --resume_from_checkpoint=/cognitive_comp/ganruyi/fengshen/mt5_summary/ckpt/model-epoch=01-train_loss=1.9166.ckpt +# --strategy=ddp +''' diff --git a/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh b/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh new file mode 100644 index 0000000000000000000000000000000000000000..0100377bf5c54c0eba3088e3b09368a5b31f9c06 --- /dev/null +++ b/fengshen/examples/wenzhong_qa/finetune_wenzhong.sh @@ -0,0 +1,126 @@ +#!/bin/bash +#SBATCH --job-name=finetune_wenzhong +#SBATCH --cpus-per-task=50 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --gres=gpu:1 # number of gpus +#SBATCH -o %x-%j.log +#SBATCH -e %x-%j.err + +set -x -e + +export MASTER_PORT=$[RANDOM%10000+50000] +export TORCH_EXTENSIONS_DIR=/cognitive_comp/gaoxinyu/torch_extendsions + +echo "START TIME: $(date)" +MICRO_BATCH_SIZE=1 +ROOT_DIR=/cognitive_comp/gaoxinyu/FS/fengshen/fengshen + +ZERO_STAGE=3 + +config_json="$ROOT_DIR/ds_config.$SLURM_JOBID.json" +#config_json="$ROOT_DIR/ds_config.wzw.json" +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + 
"train_micro_batch_size_per_gpu":1, + "steps_per_print":100, + "gradient_clipping":1, + "zero_optimization":{ + "stage": $ZERO_STAGE, + "offload_optimizer":{ + "device":"cpu", + "pin_memory":true + }, + "offload_param":{ + "device":"cpu", + "pin_memory":true + }, + "overlap_comm":true, + "contiguous_gradients":true, + "sub_group_size":1000000000, + "stage3_max_live_parameters":1000000000, + "stage3_max_reuse_distance":1000000000, + "stage3_gather_fp16_weights_on_model_save":true + }, + "optimizer":{ + "type":"Adam", + "params":{ + "lr": 1e-5, + "weight_decay":0.01 + } + }, + "scheduler":{ + "type":"WarmupLR", + "params":{ + "warmup_min_lr":5e-6, + "warmup_max_lr":1e-5 + } + }, + "zero_allow_untested_optimizer":false, + "fp16":{ + "enabled":true, + "loss_scale":0, + "loss_scale_window":1000, + "hysteresis":2, + "min_loss_scale":1 + }, + "activation_checkpointing":{ + "partition_activations":false, + "contiguous_memory_optimization":false + }, + "wall_clock_breakdown":false +} +EOT + +export PL_DEEPSPEED_CONFIG_PATH=$config_json + +TRAINER_ARGS=" + --max_epochs 2 \ + --gpus 1 \ + --num_nodes 1 \ + --strategy deepspeed_stage_3 \ + --precision 16 \ + --default_root_dir $ROOT_DIR \ + --dirpath $ROOT_DIR/ckpt \ + --save_top_k 3 \ + --monitor train_loss \ + --mode min \ + --save_last \ +" +DATA_DIR=/cognitive_comp/gaoxinyu/data/yuyuan +DATA_ARGS=" + --data_dir $DATA_DIR \ + --train_batchsize $MICRO_BATCH_SIZE \ + --valid_batchsize $MICRO_BATCH_SIZE \ + --train_data train.txt \ + --valid_data valid.txt \ + --test_data test.txt +" + +MODEL_ARGS=" + --pretrained_model_path /cognitive_comp/gaoxinyu/hf_model/wenzhong \ + --output_save_path $ROOT_DIR/predict.json \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --warmup 0.01 \ +" + +SCRIPTS_PATH=/cognitive_comp/gaoxinyu/FS/fengshen/finetune_wenzhong.py + +export CMD=" \ + $SCRIPTS_PATH \ + $TRAINER_ARGS \ + $MODEL_ARGS \ + $DATA_ARGS \ + " + +echo $CMD + +SINGULARITY_PATH=/cognitive_comp/gaoxinyu/docker/pytorch21_06_py3_docker_image_v2.sif + +# to debug - add echo (it exits and prints what it would have launched) +#run_cmd="$PY_LAUNCHER $CMD" + +clear; srun --jobid $SLURM_JOBID singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH bash -c 'python $CMD' +# bash -c 'python $CMD' \ No newline at end of file diff --git a/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py b/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..1404571159ea95776c3953fdecb28a84031c1347 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py @@ -0,0 +1,610 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from fengshen.models.zen1.tokenization import BertTokenizer +from fengshen.models.zen1.modeling import ZenForSequenceClassification +from fengshen.models.zen1.ngram_utils import ZenNgramDict +from pytorch_lightning.callbacks import LearningRateMonitor +import csv +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +from random import shuffle +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, mode): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + @classmethod + def _read_json(cls, input_file): + """Reads a jsonl file.""" + with open(input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + samples.append(data) + return samples + + +class TnewsProcessor(DataProcessor): + """Processor for the tnews data set (HIT version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % 
(set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +class OcnliProcessor(DataProcessor): + """Processor for the ocnli or cmnli data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence1'] + text_b = line['sentence2'] + label = line['label'] if 'label' in line.keys() else None + # 特殊处理,cmnli有label为-的 + if label == '-': + label = None + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class IflytekProcessor(DataProcessor): + """Processor for the iflytek data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
+ tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the word segment from 2 to 7 to check whether there is a word + for p in range(2, 8): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the word + # i is the length of the current word + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment]) + + shuffle(ngram_matches) + # max_word_in_seq_proportion = max_word_in_seq + max_word_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_word_in_seq_proportion: + ngram_matches = ngram_matches[:max_word_in_seq_proportion] + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens_a) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = 1.0 + + # Zero-pad up to the max word in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + label_id = label_map[example.label] if example.label is not None else 0 + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array)) + + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + + } + # return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='tnews', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = 
args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'afqmc': OcnliProcessor, + 'tnews': TnewsProcessor, + 'ocnli': OcnliProcessor, + 'cmnli': OcnliProcessor, + 'iflytek': IflytekProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + self.collator.label2id = self.label2id + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.model = ZenForSequenceClassification.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels) + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, 
logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.logits + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + model = LitModel(args) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py b/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb77bbe0edf675300614982466e802964f8c625 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py @@ -0,0 +1,647 @@ +# coding=utf-8 +# 
Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from fengshen.models.zen1.ngram_utils import ZenNgramDict +from fengshen.models.zen1.modeling import ZenForTokenClassification +from fengshen.metric.metric import SeqEntityScore +from fengshen.models.zen1.tokenization import BertTokenizer +from random import shuffle +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.ERROR) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks, valid_ids=None, label_mask=None): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.valid_ids = valid_ids + self.label_mask = label_mask + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label: i for i, label in enumerate(label_list, 1)} + + features = [] + for (ex_index, example) in enumerate(examples): + textlist = example.text_a + labellist = example.label + tokens = [] + labels = [] + valid = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + valid.append(1) + label_mask.append(1) + else: + valid.append(0) + if len(tokens) >= max_seq_length - 1: + tokens = tokens[0:(max_seq_length - 2)] + labels = labels[0:(max_seq_length - 2)] + valid = valid[0:(max_seq_length - 2)] + label_mask = label_mask[0:(max_seq_length - 2)] + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + valid.insert(0, 1) + label_mask.insert(0, 1) + label_ids.append(label_map["[CLS]"]) + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + if len(labels) > i: + label_ids.append(label_map[labels[i]]) + ntokens.append("[SEP]") + segment_ids.append(0) + valid.append(1) + label_mask.append(1) + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + label_mask = [1] * len(label_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + label_ids.append(0) + valid.append(1) + label_mask.append(0) + while len(label_ids) < max_seq_length: + label_ids.append(0) + label_mask.append(0) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(valid) == max_seq_length + assert len(label_mask) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the ngram segment from 2 to 7 to check whether there is a ngram + for p in range(2, 8): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the ngram + # i is the length of the current ngram + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment]) + + shuffle(ngram_matches) + + max_ngram_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_ngram_in_seq_proportion: + ngram_matches = ngram_matches[:max_ngram_in_seq_proportion] + + 
ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = 1.0 + + # Zero-pad up to the max ngram in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_ids, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + valid_ids=valid, + label_mask=label_mask)) + return features + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, set_type, quotechar=' '): + """See base class.""" + return self._create_examples( + self._read_tsv(data_path, self.get_quotechar()), set_type) + + def _create_examples(self, lines, set_type): + examples = [] + for i, (sentence, label) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = sentence + label = label + examples.append(InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + def get_quotechar(self): + return ' ' + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + ''' + read file + return format : + [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] + ''' + f = open(input_file) + data = [] + sentence = [] + label = [] + for line in f: + if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + continue + splits = line.split(quotechar) + sentence.append(splits[0]) + label.append(splits[-1][:-1]) + + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + return data + + +class MSRAProcessor(DataProcessor): + """Processor for the msra data set.""" + + def get_labels(self): + return ['B-NR', 'B-NS', 'B-NT', 'E-NR', 'E-NS', 'E-NT', 'M-NR', + 'M-NS', 'M-NT', 'O', 'S-NR', 'S-NS', 'S-NT', '[CLS]', '[SEP]'] + + +class OntoNotes4Processor(DataProcessor): + """Processor for the OntoNotes4 data set.""" + + def get_labels(self): + return ['B-GPE', 'B-LOC', 'B-ORG', 'B-PER', 'E-GPE', 'E-LOC', + 'E-ORG', 'E-PER', 'M-GPE', 'M-LOC', 'M-ORG', 'M-PER', 'O', + 'S-GPE', 'S-LOC', 'S-ORG', 'S-PER', '[CLS]', '[SEP]'] + + +class WeiboProcessor(DataProcessor): + """Processor for the Weibo data set.""" + + def get_labels(self): + return ['B-GPE.NAM', 'B-GPE.NOM', 'B-LOC.NAM', 'B-LOC.NOM', + 'B-ORG.NAM', 'B-ORG.NOM', 'B-PER.NAM', 'B-PER.NOM', 'E-GPE.NAM', + 'E-GPE.NOM', 'E-LOC.NAM', 
'E-LOC.NOM', 'E-ORG.NAM', 'E-ORG.NOM', + 'E-PER.NAM', 'E-PER.NOM', 'M-GPE.NAM', 'M-LOC.NAM', 'M-LOC.NOM', + 'M-ORG.NAM', 'M-ORG.NOM', 'M-PER.NAM', 'M-PER.NOM', 'O', + 'S-GPE.NAM', 'S-LOC.NOM', 'S-PER.NAM', 'S-PER.NOM', '[CLS]', '[SEP]'] + + +class ResumeProcessor(DataProcessor): + """Processor for the resume data set.""" + + def get_labels(self): + return ['B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', + 'B-RACE', 'B-TITLE', 'E-CONT', 'E-EDU', 'E-LOC', 'E-NAME', + 'E-ORG', 'E-PRO', 'E-RACE', 'E-TITLE', 'M-CONT', 'M-EDU', + 'M-LOC', 'M-NAME', 'M-ORG', 'M-PRO', 'M-RACE', 'M-TITLE', + 'O', 'S-NAME', 'S-ORG', 'S-RACE', '[CLS]', '[SEP]'] + + +class CMeEEProcessor(DataProcessor): + """Processor for the CMeEE data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-临床表现', 'B-医学检验项目', 'B-医疗程序', 'B-医疗设备', + 'B-微生物类', 'B-疾病', 'B-科室', 'B-药物', 'B-身体', 'I-临床表现', + 'I-医学检验项目', 'I-医疗程序', 'I-医疗设备', 'I-微生物类', + 'I-疾病', 'I-科室', 'I-药物', 'I-身体', 'O', '[CLS]', '[SEP]'] + + +class CLUENERProcessor(DataProcessor): + """Processor for the CLUENER data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-书名', 'B-公司', 'B-地址', 'B-姓名', 'B-政府', 'B-景点', + 'B-游戏', 'B-电影', 'B-组织机构', 'B-职位', 'I-书名', 'I-公司', + 'I-地址', 'I-姓名', 'I-政府', 'I-景点', 'I-游戏', 'I-电影', + 'I-组织机构', 'I-职位', 'O', '[CLS]', '[SEP]'] + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + # label_mask = torch.tensor([f.label_mask for f in features], dtype=torch.long) + return { + 'input_ids': input_ids, + 'ngram_ids': ngram_ids, + 'ngram_positions': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + 'valid_ids': valid_ids, + } + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME 
DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='weibo', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'weibo': WeiboProcessor, + 'resume': ResumeProcessor, + 'msra': MSRAProcessor, + 'ontonotes4': OntoNotes4Processor, + 'cmeee': CMeEEProcessor, + 'cluener': CLUENERProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + # 生成id映射 + label_list = processor.get_labels() + label2id = {label: i for i, label in enumerate(label_list, 1)} + label2id["[PAD]"] = 0 + self.id2label = {v: k for k, v in label2id.items()} + self.collator.label2id = label2id + + if args.dataset_name is None: + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--markup', default='bios', type=str) + parser.add_argument('--middle_prefix', default='I-', type=str) + return 
parent_args + + def __init__(self, args, id2label): + super().__init__() + # config = ZenConfig(os.path.join(args.pretrained_model_path, 'config.json')) + self.model = ZenForTokenClassification.from_pretrained(args.pretrained_model_path, num_labels=len(id2label)) + self.seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.train_seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.id2label = id2label + self.label2id = {v: k for k, v in id2label.items()} + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss, _ = outputs + # logits = outputs.logits + # preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + # preds = preds.detach().cpu().numpy() + # labels = batch['labels'].detach().cpu().numpy() + # num_labels = len(self.label2id) + # y_true = [] + # y_pred = [] + # for i, label in enumerate(labels): + # temp_1 = [] + # temp_2 = [] + # for j, m in enumerate(label): + # if j == 0: + # continue + # elif labels[i][j] == num_labels - 1: + # y_true.append(temp_1) + # y_pred.append(temp_2) + # break + # else: + # temp_1.append(self.id2label[labels[i][j]]) + # temp_2.append(self.id2label[preds[i][j]]) + + # self.train_seq_entity_score.update(y_true, y_pred) + # result = self.train_seq_entity_score.result() + # self.train_seq_entity_score.reset() + self.log('train_loss', loss) + + return loss + + def validation_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss, logits = outputs + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + num_labels = len(self.label2id) + y_true = [] + y_pred = [] + for i, label in enumerate(labels): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(label): + if j == 0: + continue + elif labels[i][j] == num_labels - 1: + y_true.append(temp_1) + y_pred.append(temp_2) + break + else: + temp_1.append(self.id2label[labels[i][j]]) + temp_2.append(self.id2label[preds[i][j]]) + + self.seq_entity_score.update(y_true, y_pred) + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.seq_entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.seq_entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', 
default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + id2label = data_model.id2label + print('id2label:', id2label) + model = LitModel(args, id2label) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh b/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..39f2b54063725514f3fd57fa56346a0796e26828 --- /dev/null +++ b/fengshen/examples/zen1_finetune/fs_zen1_tnews.sh @@ -0,0 +1,95 @@ +#!/bin/bash +#SBATCH --job-name=zen1_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export CUDA_LAUNCH_BLOCKING=1 +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen1 + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! 
+else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! +fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN1-224M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen1_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh b/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..be51a3f3d709d761b6dcb4e5759cc5b92a09a609 --- /dev/null +++ b/fengshen/examples/zen1_finetune/ner_zen1_ontonotes4.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen1_base_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen1_base_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen1_base + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/ZEN_pretrain_base_v0.1.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 64 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --task_name ontonotes4 \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen1_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py b/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..ed400468cc3d0820d4b34385f270639014039ad1 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py @@ -0,0 +1,649 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
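+# Fine-tunes ZEN2 for sequence-level classification on CLUE-style tasks
+# (tnews / afqmc / ocnli / cmnli / iflytek) with PyTorch Lightning: a task
+# DataProcessor reads JSONL examples, convert_examples_to_features adds the
+# n-gram features from ZenNgramDict, and LitModel wraps
+# ZenForSequenceClassification for training, validation and prediction.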
+from fengshen.models.zen2.modeling import ZenForSequenceClassification +from fengshen.models.zen2.ngram_utils import ZenNgramDict +from fengshen.models.zen2.tokenization import BertTokenizer +from pytorch_lightning.callbacks import LearningRateMonitor +import csv +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +from tqdm import tqdm +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None, qid=0): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + self.qid = qid + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, + ngram_ids, ngram_starts, ngram_lengths, ngram_tuples, ngram_seg_ids, ngram_masks, ngram_freqs, + qid=-1): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.qid = qid + + self.ngram_ids = ngram_ids + self.ngram_starts = ngram_starts + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + self.ngram_freqs = ngram_freqs + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, mode): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + @classmethod + def _read_json(cls, input_file): + """Reads a jsonl file.""" + with open(input_file, "r", encoding="utf-8") as f: + lines = f.readlines() + samples = [] + for line in tqdm(lines): + data = json.loads(line) + samples.append(data) + return samples + + +class TnewsProcessor(DataProcessor): + """Processor for the tnews data set (HIT version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_json(os.path.join(data_dir, "train.json")), "train") + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in 
enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +class OcnliProcessor(DataProcessor): + """Processor for the ocnli or cmnli data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence1'] + text_b = line['sentence2'] + label = line['label'] if 'label' in line.keys() else None + # 特殊处理,cmnli有label为-的 + if label == '-': + label = None + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class IflytekProcessor(DataProcessor): + """Processor for the iflytek data set (HIT version).""" + + def get_examples(self, data_path, mode): + return self._create_examples( + self._read_json(data_path), + set_type=mode + ) + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # if i == 0: + # continue + guid = "%s-%s" % (set_type, i) + # text_a = line[0] + text_a = line['sentence'] + label = line['label'] if 'label' in line.keys() else None + examples.append( + InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label : i for i, label in enumerate(label_list)} + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
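+        # Illustration with assumed inputs (and assuming the tokenizer splits
+        # Chinese text per character): for text_a = "今天天气" and text_b = "天气不错",
+        # the code below builds
+        #   [CLS] 今 天 天 气 [SEP] 天 气 不 错 [SEP]
+        # with segment_ids 0 0 0 0 0 0 1 1 1 1 1.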
+ tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) + + if tokens_b: + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the word segment from 2 to max_ngram_len to check whether there is a word + max_gram_n = ngram_dict.max_ngram_len + for p in range(2, max_gram_n): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the word + # i is the length of the current word + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_freq = ngram_dict.ngram_to_freq_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment, ngram_freq]) + + # shuffle(ngram_matches) + ngram_matches = sorted(ngram_matches, key=lambda s: s[0]) + # max_word_in_seq_proportion = max_word_in_seq + max_word_in_seq_proportion = math.ceil((len(tokens) / max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_word_in_seq_proportion: + ngram_matches = ngram_matches[:max_word_in_seq_proportion] + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_freqs = [ngram[4] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < len([id for id in segment_ids if id == 0]) else 1 for position in + ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # Zero-pad up to the max word in seq length. 
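+        # All n-gram feature lists are padded to ngram_dict.max_ngram_in_seq so
+        # the collator can stack them into fixed-shape tensors.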
+ padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_positions += padding + ngram_lengths += padding + ngram_seg_ids += padding + ngram_freqs += padding + + # ----------- code for ngram END----------- + + label_id = label_map[example.label] if example.label is not None else 0 + # if ex_index < 5: + # logger.info("*** Example ***") + # logger.info("guid: %s" % (example.guid)) + # logger.info("tokens: %s" % " ".join( + # [str(x) for x in tokens])) + # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + # logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + # logger.info( + # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + # logger.info("label: %s (id = %d)" % (example.label, label_id)) + # logger.info("ngram_ids: %s" % " ".join([str(x) for x in ngram_ids])) + # logger.info("ngram_positions: %s" % " ".join([str(x) for x in ngram_positions])) + # logger.info("ngram_lengths: %s" % " ".join([str(x) for x in ngram_lengths])) + # logger.info("ngram_tuples: %s" % " ".join([str(x) for x in ngram_tuples])) + # logger.info("ngram_seg_ids: %s" % " ".join([str(x) for x in ngram_seg_ids])) + # logger.info("ngram_freqs: %s" % " ".join([str(x) for x in ngram_freqs])) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + ngram_ids=ngram_ids, + ngram_starts=ngram_positions, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + ngram_freqs=ngram_freqs, + qid=example.qid)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
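+    # Worked example (assumed values): with max_length=5, tokens_a=['a','b','c','d']
+    # and tokens_b=['x','y'], the loop below pops once from the longer tokens_a,
+    # leaving tokens_a=['a','b','c'] and tokens_b=['x','y'] (total length 5).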
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + +@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + # qids = torch.tensor([f.qid for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_starts = torch.tensor([f.ngram_starts for f in features], dtype=torch.long) + ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + ngram_freqs = torch.tensor([f.ngram_freqs for f in features], dtype=torch.long) + + batch_size = len(samples) + ngram_positions_matrix = torch.zeros( + size=(batch_size, self.args.max_seq_length, self.ngram_dict.max_ngram_in_seq), + dtype=torch.int) + for batch_id in range(batch_size): + ngram_id = ngram_ids[batch_id] + ngram_start = ngram_starts[batch_id] + ngram_length = ngram_lengths[batch_id] + for i in range(len(ngram_id)): + ngram_positions_matrix[batch_id][ngram_start[i]:ngram_start[i] + ngram_length[i], i] = ngram_freqs[batch_id][i] + ngram_positions_matrix[batch_id] \ + = torch.div(ngram_positions_matrix[batch_id], + torch.stack([torch.sum(ngram_positions_matrix[batch_id], 1)] * + ngram_positions_matrix[batch_id].size(1)).t() + 1e-10) + + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions_matrix, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids + + } + + # return default_collate(sample_list) + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + 
parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='tnews', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'afqmc': OcnliProcessor, + 'tnews': TnewsProcessor, + 'ocnli': OcnliProcessor, + 'cmnli': OcnliProcessor, + 'iflytek': IflytekProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + if args.dataset_name is None: + self.label2id, self.id2label = self.load_schema(os.path.join( + args.data_dir, args.train_data), args) + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + self.collator.label2id = self.label2id + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def load_schema(self, data_path, args): + with open(data_path, 'r', encoding='utf8') as f: + lines = f.readlines() + label_list = [] + for line in tqdm(lines): + data = json.loads(line) + labels = data[args.label_name] if args.label_name in data.keys( + ) else 0 + if labels not in label_list: + label_list.append(labels) + + label2id, id2label = {}, {} + for i, k in enumerate(label_list): + label2id[k] = i + id2label[i] = k + return label2id, id2label + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--num_labels', default=2, type=int) + + return parent_args + + def __init__(self, args): + super().__init__() + self.model = ZenForSequenceClassification.from_pretrained(args.pretrained_model_path, num_labels=args.num_labels) + self.save_hyperparameters(args) 
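+        # Worked example (assumed numbers) for the total_steps estimate in setup():
+        # with 10,000 training examples, train_batchsize=32, world_size=1,
+        # accumulate_grad_batches=1 and max_epochs=10,
+        # total_steps = (10000 * 10 // (32 * 1)) // 1 = 3125.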
+ + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('train_loss', loss) + self.log('train_acc', acc) + return loss + + def comput_metrix(self, logits, labels): + y_pred = torch.argmax(logits, dim=-1) + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)).float() + corr = torch.eq(y_pred, y_true) + acc = torch.sum(corr.float())/labels.size()[0] + return acc + + def validation_step(self, batch, batch_idx): + loss, logits = self.model(**batch) + acc = self.comput_metrix(logits, batch['labels']) + self.log('val_loss', loss) + self.log('val_acc', acc) + + def predict_step(self, batch, batch_idx): + output = self.model(**batch) + return output.logits + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = 
add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + model = LitModel(args) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py b/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py new file mode 100644 index 0000000000000000000000000000000000000000..619847c1555311226be69d7d0558368dfd048546 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py @@ -0,0 +1,678 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from fengshen.models.zen2.modeling import ZenForTokenClassification +from fengshen.metric.metric import SeqEntityScore +from fengshen.models.zen2.tokenization import BertTokenizer +from fengshen.models.zen2.ngram_utils import ZenNgramDict +from pytorch_lightning.callbacks import LearningRateMonitor +from dataclasses import dataclass +import logging +import math +import numpy as np +import os +import json +import torch +import pytorch_lightning as pl +import argparse +from pytorch_lightning.callbacks import ModelCheckpoint +from torch.utils.data import Dataset, DataLoader + +import torch.nn.functional as F +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.ERROR) +logger = logging.getLogger(__name__) + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id, ngram_ids, ngram_positions, ngram_lengths, + ngram_tuples, ngram_seg_ids, ngram_masks, valid_ids=None, label_mask=None, b_use_valid_filter=False): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.valid_ids = valid_ids + self.label_mask = label_mask + + self.ngram_ids = ngram_ids + self.ngram_positions = ngram_positions + self.ngram_lengths = ngram_lengths + self.ngram_tuples = ngram_tuples + self.ngram_seg_ids = ngram_seg_ids + self.ngram_masks = ngram_masks + + self.b_use_valid_filter = b_use_valid_filter + + +def convert_examples_to_features(examples, label_map, max_seq_length, tokenizer, ngram_dict): + """Loads a data file into a list of `InputBatch`s.""" + + # label_map = {label: i for i, label in enumerate(label_list, 1)} + # label_map["[PAD]"] = 0 + + features = [] + b_use_valid_filter = False + for (ex_index, example) in enumerate(examples): + textlist = example.text_a + labellist = example.label + tokens = [] + labels = [] + valid = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + if len(tokens) + len(token) > max_seq_length - 2: + break + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + valid.append(1) + label_mask.append(1) + else: + valid.append(0) + b_use_valid_filter = True + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + valid.insert(0, 1) + label_mask.insert(0, 1) + label_ids.append(label_map["[CLS]"]) + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + if len(labels) > i: + label_ids.append(label_map[labels[i]]) + ntokens.append("[SEP]") + segment_ids.append(0) + valid.append(1) + label_mask.append(1) + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + label_mask = [1] * len(label_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + label_ids.append(0) + valid.append(1) + label_mask.append(0) + while len(label_ids) < max_seq_length: + label_ids.append(0) + label_mask.append(0) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(valid) == max_seq_length + assert len(label_mask) == max_seq_length + + # ----------- code for ngram BEGIN----------- + ngram_matches = [] + # Filter the ngram segment from 2 to 7 to check whether there is a ngram + max_gram_n = ngram_dict.max_ngram_len + for p in range(2, max_gram_n): + for q in range(0, len(tokens) - p + 1): + character_segment = tokens[q:q + p] + # j is the starting position of the ngram + # i is the length of the current ngram + character_segment = tuple(character_segment) + if character_segment in ngram_dict.ngram_to_id_dict: + ngram_index = ngram_dict.ngram_to_id_dict[character_segment] + ngram_freq = ngram_dict.ngram_to_freq_dict[character_segment] + ngram_matches.append([ngram_index, q, p, character_segment, ngram_freq]) + + ngram_matches = sorted(ngram_matches, key=lambda s: s[0]) + + max_ngram_in_seq_proportion = math.ceil((len(tokens) / 
max_seq_length) * ngram_dict.max_ngram_in_seq) + if len(ngram_matches) > max_ngram_in_seq_proportion: + ngram_matches = ngram_matches[:max_ngram_in_seq_proportion] + + ngram_ids = [ngram[0] for ngram in ngram_matches] + ngram_positions = [ngram[1] for ngram in ngram_matches] + ngram_lengths = [ngram[2] for ngram in ngram_matches] + ngram_tuples = [ngram[3] for ngram in ngram_matches] + ngram_freqs = [ngram[4] for ngram in ngram_matches] + ngram_seg_ids = [0 if position < (len(tokens) + 2) else 1 for position in ngram_positions] + + ngram_mask_array = np.zeros(ngram_dict.max_ngram_in_seq, dtype=np.bool) + ngram_mask_array[:len(ngram_ids)] = 1 + + # record the masked positions + ngram_positions_matrix = np.zeros(shape=(max_seq_length, ngram_dict.max_ngram_in_seq), dtype=np.int32) + for i in range(len(ngram_ids)): + ngram_positions_matrix[ngram_positions[i]:ngram_positions[i] + ngram_lengths[i], i] = ngram_freqs[i] + ngram_positions_matrix = torch.from_numpy(ngram_positions_matrix.astype(np.float)) + ngram_positions_matrix = torch.div(ngram_positions_matrix, torch.stack( + [torch.sum(ngram_positions_matrix, 1)] * ngram_positions_matrix.size(1)).t() + 1e-10) + ngram_positions_matrix = ngram_positions_matrix.numpy() + + # Zero-pad up to the max ngram in seq length. + padding = [0] * (ngram_dict.max_ngram_in_seq - len(ngram_ids)) + ngram_ids += padding + ngram_lengths += padding + ngram_seg_ids += padding + + # ----------- code for ngram END----------- + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("label: %s (id = %s)" % (",".join([str(x) for x in example.label]), ",".join([str(x) for x in label_ids]))) + logger.info("valid: %s" % " ".join([str(x) for x in valid])) + logger.info("b_use_valid_filter: %s" % str(b_use_valid_filter)) + logger.info("ngram_ids: %s" % " ".join([str(x) for x in ngram_ids])) + logger.info("ngram_positions: %s" % " ".join([str(x) for x in ngram_positions])) + logger.info("ngram_lengths: %s" % " ".join([str(x) for x in ngram_lengths])) + logger.info("ngram_tuples: %s" % " ".join([str(x) for x in ngram_tuples])) + logger.info("ngram_seg_ids: %s" % " ".join([str(x) for x in ngram_seg_ids])) + + features.append( + InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_ids, + ngram_ids=ngram_ids, + ngram_positions=ngram_positions_matrix, + ngram_lengths=ngram_lengths, + ngram_tuples=ngram_tuples, + ngram_seg_ids=ngram_seg_ids, + ngram_masks=ngram_mask_array, + valid_ids=valid, + label_mask=label_mask, + b_use_valid_filter=b_use_valid_filter)) + return features + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_examples(self, data_path, set_type, quotechar=' '): + """See base class.""" + return self._create_examples( + self._read_tsv(data_path, self.get_quotechar()), set_type) + + def _create_examples(self, lines, set_type): + examples = [] + for i, (sentence, label) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = sentence + label = label + examples.append(InputExample(guid=guid, text_a=text_a, label=label)) + return examples + + def get_labels(self): + """Gets the list of labels for this data 
set.""" + raise NotImplementedError() + + def get_quotechar(self): + return ' ' + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + ''' + read file + return format : + [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] + ''' + f = open(input_file) + data = [] + sentence = [] + label = [] + for line in f: + if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n": + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + continue + splits = line.split(quotechar) + sentence.append(splits[0]) + label.append(splits[-1][:-1]) + + if len(sentence) > 0: + data.append((sentence, label)) + sentence = [] + label = [] + return data + + +class MSRAProcessor(DataProcessor): + """Processor for the msra data set.""" + + def get_labels(self): + return ['B-NR', 'B-NS', 'B-NT', 'E-NR', 'E-NS', 'E-NT', 'M-NR', + 'M-NS', 'M-NT', 'O', 'S-NR', 'S-NS', 'S-NT', '[CLS]', '[SEP]'] + + +class OntoNotes4Processor(DataProcessor): + """Processor for the OntoNotes4 data set.""" + + def get_labels(self): + return ['B-GPE', 'B-LOC', 'B-ORG', 'B-PER', 'E-GPE', 'E-LOC', + 'E-ORG', 'E-PER', 'M-GPE', 'M-LOC', 'M-ORG', 'M-PER', 'O', + 'S-GPE', 'S-LOC', 'S-ORG', 'S-PER', '[CLS]', '[SEP]'] + + +class WeiboProcessor(DataProcessor): + """Processor for the Weibo data set.""" + + def get_labels(self): + return ['B-GPE.NAM', 'B-GPE.NOM', 'B-LOC.NAM', 'B-LOC.NOM', + 'B-ORG.NAM', 'B-ORG.NOM', 'B-PER.NAM', 'B-PER.NOM', 'E-GPE.NAM', + 'E-GPE.NOM', 'E-LOC.NAM', 'E-LOC.NOM', 'E-ORG.NAM', 'E-ORG.NOM', + 'E-PER.NAM', 'E-PER.NOM', 'M-GPE.NAM', 'M-LOC.NAM', 'M-LOC.NOM', + 'M-ORG.NAM', 'M-ORG.NOM', 'M-PER.NAM', 'M-PER.NOM', 'O', + 'S-GPE.NAM', 'S-LOC.NOM', 'S-PER.NAM', 'S-PER.NOM', '[CLS]', '[SEP]'] + + +class ResumeProcessor(DataProcessor): + """Processor for the resume data set.""" + + def get_labels(self): + return ['B-CONT', 'B-EDU', 'B-LOC', 'B-NAME', 'B-ORG', 'B-PRO', + 'B-RACE', 'B-TITLE', 'E-CONT', 'E-EDU', 'E-LOC', 'E-NAME', + 'E-ORG', 'E-PRO', 'E-RACE', 'E-TITLE', 'M-CONT', 'M-EDU', + 'M-LOC', 'M-NAME', 'M-ORG', 'M-PRO', 'M-RACE', 'M-TITLE', + 'O', 'S-NAME', 'S-ORG', 'S-RACE', '[CLS]', '[SEP]'] + + +class CMeEEProcessor(DataProcessor): + """Processor for the CMeEE data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-临床表现', 'B-医学检验项目', 'B-医疗程序', 'B-医疗设备', + 'B-微生物类', 'B-疾病', 'B-科室', 'B-药物', 'B-身体', 'I-临床表现', + 'I-医学检验项目', 'I-医疗程序', 'I-医疗设备', 'I-微生物类', + 'I-疾病', 'I-科室', 'I-药物', 'I-身体', 'O', '[CLS]', '[SEP]'] + + +class CLUENERProcessor(DataProcessor): + """Processor for the CLUENER data set.""" + + def get_quotechar(self): + return '\t' + + def get_labels(self): + return ['B-书名', 'B-公司', 'B-地址', 'B-姓名', 'B-政府', 'B-景点', + 'B-游戏', 'B-电影', 'B-组织机构', 'B-职位', 'I-书名', 'I-公司', + 'I-地址', 'I-姓名', 'I-政府', 'I-景点', 'I-游戏', 'I-电影', + 'I-组织机构', 'I-职位', 'O', '[CLS]', '[SEP]'] + + +class TaskDataset(Dataset): + def __init__(self, data_path, processor, mode='train'): + super().__init__() + self.data = self.load_data(data_path, processor, mode) + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def load_data(self, data_path, processor, mode): + if mode == "train": + examples = processor.get_examples(data_path, mode) + elif mode == "test": + examples = processor.get_examples(data_path, mode) + elif mode == "dev": + examples = processor.get_examples(data_path, mode) + return examples + + 
+@dataclass +class TaskCollator: + args = None + tokenizer = None + ngram_dict = None + label2id = None + + def __call__(self, samples): + features = convert_examples_to_features(samples, self.label2id, self.args.max_seq_length, self.tokenizer, self.ngram_dict) + # logger.info(" Num examples = %d", len(samples)) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + valid_ids = torch.tensor([f.valid_ids for f in features], dtype=torch.long) + + ngram_ids = torch.tensor([f.ngram_ids for f in features], dtype=torch.long) + ngram_positions = torch.tensor([f.ngram_positions for f in features], dtype=torch.long) + # ngram_lengths = torch.tensor([f.ngram_lengths for f in features], dtype=torch.long) + # ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in features], dtype=torch.long) + # ngram_masks = torch.tensor([f.ngram_masks for f in features], dtype=torch.long) + + # label_mask = torch.tensor([f.label_mask for f in features], dtype=torch.long) + b_use_valid_filter = torch.tensor([f.b_use_valid_filter for f in features], dtype=torch.bool) + # 取第一个出来? + # b_use_valid_filter = b_use_valid_filter.detach().cpu().numpy()[0] + b_use_valid_filter = b_use_valid_filter[0] + return { + 'input_ids': input_ids, + 'input_ngram_ids': ngram_ids, + 'ngram_position_matrix': ngram_positions, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids, + 'labels': label_ids, + 'valid_ids': valid_ids, + 'b_use_valid_filter': b_use_valid_filter, + } + + +class TaskDataModel(pl.LightningDataModule): + @staticmethod + def add_data_specific_args(parent_args): + parser = parent_args.add_argument_group('TASK NAME DataModel') + parser.add_argument('--data_dir', default='./data', type=str) + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--train_data', default='train.json', type=str) + parser.add_argument('--valid_data', default='dev.json', type=str) + parser.add_argument('--test_data', default='test.json', type=str) + parser.add_argument('--train_batchsize', default=16, type=int) + parser.add_argument('--valid_batchsize', default=32, type=int) + parser.add_argument('--max_seq_length', default=128, type=int) + + parser.add_argument('--texta_name', default='text', type=str) + parser.add_argument('--textb_name', default='sentence2', type=str) + parser.add_argument('--label_name', default='label', type=str) + parser.add_argument('--id_name', default='id', type=str) + + parser.add_argument('--dataset_name', default=None, type=str) + parser.add_argument('--vocab_file', + type=str, default=None, + help="Vocabulary mapping/file BERT was pretrainined on") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument('--task_name', default='weibo', type=str) + + return parent_args + + def __init__(self, args): + super().__init__() + self.train_batchsize = args.train_batchsize + self.valid_batchsize = args.valid_batchsize + self.collator = TaskCollator() + self.collator.args = args + self.collator.tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_path, do_lower_case=args.do_lower_case) + self.collator.ngram_dict = ZenNgramDict.from_pretrained(args.pretrained_model_path, tokenizer=self.collator.tokenizer) + + processors = { + 'weibo': 
WeiboProcessor, + 'resume': ResumeProcessor, + 'msra': MSRAProcessor, + 'ontonotes4': OntoNotes4Processor, + 'cmeee': CMeEEProcessor, + 'cluener': CLUENERProcessor, + } + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + # 生成id映射 + label_list = processor.get_labels() + label2id = {label: i for i, label in enumerate(label_list, 1)} + label2id["[PAD]"] = 0 + self.id2label = {v: k for k, v in label2id.items()} + self.collator.label2id = label2id + + if args.dataset_name is None: + self.train_data = TaskDataset(os.path.join( + args.data_dir, args.train_data), processor, mode='train') + self.valid_data = TaskDataset(os.path.join( + args.data_dir, args.valid_data), processor, mode='dev') + self.test_data = TaskDataset(os.path.join( + args.data_dir, args.test_data), processor, mode='test') + + else: + import datasets + ds = datasets.load_dataset(args.dataset_name) + self.train_data = ds['train'] + self.valid_data = ds['validation'] + self.test_data = ds['test'] + self.save_hyperparameters(args) + + def train_dataloader(self): + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batchsize, pin_memory=False, + collate_fn=self.collator) + + def val_dataloader(self): + return DataLoader(self.valid_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + def predict_dataloader(self): + return DataLoader(self.test_data, shuffle=False, batch_size=self.valid_batchsize, pin_memory=False, + collate_fn=self.collator) + + +class LitModel(pl.LightningModule): + + @staticmethod + def add_model_specific_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + parser.add_argument('--markup', default='bios', type=str) + parser.add_argument('--middle_prefix', default='I-', type=str) + return parent_args + + def __init__(self, args, id2label): + super().__init__() + # config = ZenConfig(os.path.join(args.pretrained_model_path, 'config.json')) + self.model = ZenForTokenClassification.from_pretrained(args.pretrained_model_path, num_labels=len(id2label)) + self.seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.train_seq_entity_score = SeqEntityScore(id2label, markup=args.markup, middle_prefix=args.middle_prefix) + self.id2label = id2label + self.label2id = {v: k for k, v in id2label.items()} + self.save_hyperparameters(args) + + def setup(self, stage) -> None: + if stage == 'fit': + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + if self.trainer.max_epochs > 0: + world_size = self.trainer.world_size + tb_size = self.hparams.train_batchsize * max(1, world_size) + ab_size = self.trainer.accumulate_grad_batches + self.total_steps = (len(train_loader.dataset) * + self.trainer.max_epochs // tb_size) // ab_size + else: + self.total_steps = self.trainer.max_steps // self.trainer.accumulate_grad_batches + + print('Total steps: {}' .format(self.total_steps)) + + def training_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + # logits = outputs.logits + # preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + # preds = preds.detach().cpu().numpy() + # labels = batch['labels'].detach().cpu().numpy() + # num_labels = len(self.label2id) + # y_true = [] + # y_pred = [] + # for i, label in enumerate(labels): + # temp_1 = [] + # temp_2 = [] + # for j, m in enumerate(label): + # if j == 0: + # continue + 
# elif labels[i][j] == num_labels - 1: + # y_true.append(temp_1) + # y_pred.append(temp_2) + # break + # else: + # temp_1.append(self.id2label[labels[i][j]]) + # temp_2.append(self.id2label[preds[i][j]]) + + # self.train_seq_entity_score.update(y_true, y_pred) + # result = self.train_seq_entity_score.result() + # self.train_seq_entity_score.reset() + self.log('train_loss', loss) + + return loss + + def validation_step(self, batch, batch_idx): + outputs = self.model(**batch) + loss = outputs.loss + logits = outputs.logits + preds = torch.argmax(F.log_softmax(logits, dim=2), dim=2) + preds = preds.detach().cpu().numpy() + labels = batch['labels'].detach().cpu().numpy() + num_labels = len(self.label2id) + y_true = [] + y_pred = [] + for i, label in enumerate(labels): + temp_1 = [] + temp_2 = [] + for j, m in enumerate(label): + if j == 0: + continue + elif labels[i][j] == num_labels - 1: + y_true.append(temp_1) + y_pred.append(temp_2) + break + else: + temp_1.append(self.id2label[labels[i][j]]) + temp_2.append(self.id2label[preds[i][j]]) + + self.seq_entity_score.update(y_true, y_pred) + self.log('val_loss', loss) + + def validation_epoch_end(self, outputs): + # compute metric for all process + score_dict, _ = self.seq_entity_score.result() + if self.trainer._accelerator_connector.cluster_environment.global_rank() == 0: + print('score_dict:\n', score_dict) + # reset the metric after once validation + self.seq_entity_score.reset() + for k, v in score_dict.items(): + self.log('val_{}'.format(k), v) + + def configure_optimizers(self): + from fengshen.models.model_utils import configure_optimizers + return configure_optimizers(self) + + +class TaskModelCheckpoint: + @staticmethod + def add_argparse_args(parent_args): + parser = parent_args.add_argument_group('BaseModel') + + parser.add_argument('--monitor', default='train_loss', type=str) + parser.add_argument('--mode', default='min', type=str) + parser.add_argument('--dirpath', default='./log/', type=str) + parser.add_argument( + '--filename', default='model-{epoch:02d}-{train_loss:.4f}', type=str) + + parser.add_argument('--save_top_k', default=3, type=float) + parser.add_argument('--every_n_train_steps', default=100, type=float) + parser.add_argument('--save_weights_only', default=True, type=bool) + + return parent_args + + def __init__(self, args): + self.callbacks = ModelCheckpoint(monitor=args.monitor, + save_top_k=args.save_top_k, + mode=args.mode, + every_n_train_steps=args.every_n_train_steps, + save_weights_only=args.save_weights_only, + dirpath=args.dirpath, + filename=args.filename) + + +def save_test(data, args, data_model): + with open(args.output_save_path, 'w', encoding='utf-8') as f: + idx = 0 + for i in range(len(data)): + batch = data[i] + for sample in batch: + tmp_result = dict() + label_id = np.argmax(sample.numpy()) + tmp_result['id'] = data_model.test_data.data[idx]['id'] + tmp_result['label'] = data_model.id2label[label_id] + json_data = json.dumps(tmp_result, ensure_ascii=False) + f.write(json_data+'\n') + idx += 1 + print('save the result to '+args.output_save_path) + + +def main(): + total_parser = argparse.ArgumentParser("TASK NAME") + total_parser.add_argument('--pretrained_model_path', default='', type=str) + total_parser.add_argument('--output_save_path', + default='./predict.json', type=str) + # * Args for data preprocessing + total_parser = TaskDataModel.add_data_specific_args(total_parser) + # * Args for training + total_parser = pl.Trainer.add_argparse_args(total_parser) + total_parser = 
TaskModelCheckpoint.add_argparse_args(total_parser) + + # * Args for base model + from fengshen.models.model_utils import add_module_args + total_parser = add_module_args(total_parser) + total_parser = LitModel.add_model_specific_args(total_parser) + + args = total_parser.parse_args() + + checkpoint_callback = TaskModelCheckpoint(args).callbacks + lr_monitor = LearningRateMonitor(logging_interval='step') + trainer = pl.Trainer.from_argparse_args(args, + callbacks=[checkpoint_callback, lr_monitor] + ) + + data_model = TaskDataModel(args) + id2label = data_model.id2label + print('id2label:', id2label) + model = LitModel(args, id2label) + trainer.fit(model, data_model) + + +if __name__ == "__main__": + main() diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..7143e61be485f0d6dc2d7912b5b30250df408b75 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_afqmc.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=afqmc + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name afqmc \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6f4f7e9eec1d11a2bf1d09f8d57303ca139f8e2 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_cmnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='4' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cmnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/cmnli_public/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 64 \ + --valid_batchsize 32 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name cmnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..9171a7c3264a856915fd9147096f097b8ebd43c8 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_iflytek.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_iflytek # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='0' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=iflytek + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name iflytek \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 119 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..f635330a4b260391a3f9d4b01998ce8305d55b8e --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_ocnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_ocnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=ocnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name ocnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh b/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..dee88afbe2639a514745771538d6c0d40e8d3329 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_base_tnews.sh @@ -0,0 +1,94 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh new file mode 100644 index 0000000000000000000000000000000000000000..1f44844a127b5bb39226c56b70bba85957dd735a --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_afqmc.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_afqmc # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='1' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=afqmc + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name afqmc \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 2 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..b2d6dfff35668596c0c748003b7b937d98604922 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_cmnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cmnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='3' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cmnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/cmnli_public/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 32 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name cmnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh new file mode 100644 index 0000000000000000000000000000000000000000..7afd7b24d27ddd1a6834935222a100351111d570 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_iflytek.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_iflytek # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='5' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=iflytek + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name iflytek \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 119 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 7 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh new file mode 100644 index 0000000000000000000000000000000000000000..5598ee8027a9bc41c4c196d71d98341557e0f4eb --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_ocnli.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_ocnli # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +export CUDA_VISIBLE_DEVICES='6' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=ocnli + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name ocnli \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --num_labels 3 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh b/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh new file mode 100644 index 0000000000000000000000000000000000000000..ec081cd3191f951c3815af423329540a219b0114 --- /dev/null +++ b/fengshen/examples/zen2_finetune/fs_zen2_large_tnews.sh @@ -0,0 +1,93 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_tnews # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=tnews + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/classification_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/yangping/data/ChineseCLUE_DATA/${TASK}_public/ +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.json \ + --valid_data dev.json \ + --test_data test1.1.json \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 128 \ + --texta_name sentence \ + --label_name label \ + --id_name id \ + --task_name tnews \ + " + +MODEL_ARGS="\ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.01 \ + --num_labels 15 \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_acc \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 400 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_acc:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 400 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_sequence_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh new file mode 100644 index 0000000000000000000000000000000000000000..04b97b5fe5123af3170523dfde0ae008a78b2428 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_cluener.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cluener # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_cluener/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cluener + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CLUENER/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.txt \ + --valid_data dev.char.txt \ + --test_data dev.char.txt \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cluener \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh new file mode 100644 index 0000000000000000000000000000000000000000..a4be7221a250030db4cf1b7d157f1d6c0fd4b0f0 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_cmeee.sh @@ -0,0 +1,92 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=2 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:2 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/lujunyu/experiments/ner_finetune/zen2_base_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) +#SBATCH -p hgx + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/lujunyu/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=cmeee + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/lujunyu/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CMeEE_copy/ +PRETRAINED_MODEL_PATH=/cognitive_comp/lujunyu/pretrain_models/zen2-base-med + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bio \ + --valid_data dev.char.bio \ + --test_data dev.char.bio \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 512 \ + --task_name cmeee \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 10 \ + --gpus 2 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 0.25 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/lujunyu/Fengshenbang-LM-Git/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +srun python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh new file mode 100644 index 0000000000000000000000000000000000000000..397c3ea6adc3d9f275389509aa41d0e4050b3c14 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_msra.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_msra # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_msra/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=msra + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/MSRA/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train_dev.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name msra \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 800 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 800 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e1237967712a6862e5770e90d4e8db8d074d320 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_ontonotes4.sh @@ -0,0 +1,92 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi
+
+DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/
+# PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0
+PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese
+
+CHECKPOINT_PATH=${ROOT_DIR}/ckpt/
+OUTPUT_PATH=${ROOT_DIR}/predict.json
+
+DATA_ARGS="\
+ --data_dir $DATA_DIR \
+ --train_data train.char.bmes \
+ --valid_data test.char.bmes \
+ --test_data test.char.bmes \
+ --train_batchsize 32 \
+ --valid_batchsize 16 \
+ --max_seq_length 256 \
+ --task_name ontonotes4 \
+ "
+
+MODEL_ARGS="\
+ --learning_rate 3e-5 \
+ --weight_decay 0.1 \
+ --warmup_ratio 0.01 \
+ --markup bioes \
+ --middle_prefix M- \
+ "
+
+MODEL_CHECKPOINT_ARGS="\
+ --monitor val_f1 \
+ --save_top_k 3 \
+ --mode max \
+ --every_n_train_steps 200 \
+ --save_weights_only True \
+ --dirpath $CHECKPOINT_PATH \
+ --filename model-{epoch:02d}-{val_f1:.4f} \
+ "
+
+TRAINER_ARGS="\
+ --max_epochs 30 \
+ --gpus 1 \
+ --check_val_every_n_epoch 1 \
+ --val_check_interval 200 \
+ --default_root_dir $ROOT_DIR \
+ "
+
+
+options=" \
+ --pretrained_model_path $PRETRAINED_MODEL_PATH \
+ --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \
+ --do_lower_case \
+ --output_save_path $OUTPUT_PATH \
+ $DATA_ARGS \
+ $MODEL_ARGS \
+ $MODEL_CHECKPOINT_ARGS \
+ $TRAINER_ARGS \
+"
+SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py
+/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+
+# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif
+# python3 $SCRIPT_PATH $options
+# source activate base
+# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options
+
diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a7aee577ed035c0f39b883aa8a2a4dd6fffd479d
--- /dev/null
+++ b/fengshen/examples/zen2_finetune/ner_zen2_base_resume.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#SBATCH --job-name=zen2_base_resume # create a short name for your job
+#SBATCH --nodes=1 # node count
+#SBATCH --ntasks=1 # total number of tasks across all nodes
+#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks)
+#SBATCH --gres=gpu:1 # number of gpus per node
+#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc.
+#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_resume/%x-%j.log # output and error file name (%x=job name, %j=job id)
+
+
+# export CUDA_VISIBLE_DEVICES='2'
+export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions
+
+MODEL_NAME=zen2_base
+
+TASK=resume
+
+ZERO_STAGE=1
+STRATEGY=deepspeed_stage_${ZERO_STAGE}
+
+ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK}
+if [ ! -d ${ROOT_DIR} ];then
+ mkdir -p ${ROOT_DIR}
+ echo ${ROOT_DIR} created!!!!!!!!!!!!!!
+else
+ echo ${ROOT_DIR} exist!!!!!!!!!!!!!!!
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/Resume/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name resume \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh b/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3f4667e59fe0b7ba98f37dec65e12fdf6faf555 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_base_weibo.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_base_weibo # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_base_weibo/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_base + +TASK=weibo + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_base_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.all.bmes \ + --valid_data test.all.bmes \ + --test_data test.all.bmes \ + --train_batchsize 32 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name weibo \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 20 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh new file mode 100644 index 0000000000000000000000000000000000000000..07193e3f15ca69755853623a57fee0a573db6593 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_cluener.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cluener # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_cluener/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cluener + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CLUENER/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.txt \ + --valid_data dev.char.txt \ + --test_data dev.char.txt \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cluener \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh new file mode 100644 index 0000000000000000000000000000000000000000..02409b04501bf6155481673b3acd0bd22914d3f3 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_cmeee.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_cmeee # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_cmeee/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=cmeee + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/CMeEE/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bio \ + --valid_data dev.char.bio \ + --test_data dev.char.bio \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name cmeee \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bio \ + --middle_prefix I- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh new file mode 100644 index 0000000000000000000000000000000000000000..cef8f1f70babc94ed77dc585fbba47f5b45ff7a5 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_msra.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_msra # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_msra/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=msra + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/MSRA/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train_dev.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name msra \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 800 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 800 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8bb41316b4cec4bb94fa36ac9bc39c9f3ce41f8 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_ontonotes4.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_ontonotes4 # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_ontonotes4/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=ontonotes4 + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/OntoNotes4/ +PRETRAINED_MODEL_PATH=IDEA-CCNL/Erlangshen-ZEN2-345M-Chinese + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name ontonotes4 \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 200 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 200 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh new file mode 100644 index 0000000000000000000000000000000000000000..e21a61f48a96f1d831c90d3cbc3a9cbe8eb7de38 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_resume.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_resume # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o /cognitive_comp/ganruyi/experiments/ner_finetune/zen2_large_resume/%x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=resume + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/Resume/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.char.bmes \ + --valid_data test.char.bmes \ + --test_data test.char.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name resume \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 100 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh b/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh new file mode 100644 index 0000000000000000000000000000000000000000..7fab2998437ef8c12dcd93466371d0324eec4c79 --- /dev/null +++ b/fengshen/examples/zen2_finetune/ner_zen2_large_weibo.sh @@ -0,0 +1,91 @@ +#!/bin/bash +#SBATCH --job-name=zen2_large_weibo # create a short name for your job +#SBATCH --nodes=1 # node count +#SBATCH --ntasks=1 # total number of tasks across all nodes +#SBATCH --cpus-per-task=30 # cpu-cores per task (>1 if multi-threaded tasks) +#SBATCH --gres=gpu:1 # number of gpus per node +#SBATCH --mail-type=ALL # send email when job begins, ends or failed etc. +#SBATCH -o %x-%j.log # output and error file name (%x=job name, %j=job id) + + +# export CUDA_VISIBLE_DEVICES='2' +export TORCH_EXTENSIONS_DIR=/cognitive_comp/ganruyi/tmp/torch_extendsions + +MODEL_NAME=zen2_large + +TASK=weibo + +ZERO_STAGE=1 +STRATEGY=deepspeed_stage_${ZERO_STAGE} + +ROOT_DIR=/cognitive_comp/ganruyi/experiments/ner_finetune/${MODEL_NAME}_${TASK} +if [ ! -d ${ROOT_DIR} ];then + mkdir -p ${ROOT_DIR} + echo ${ROOT_DIR} created!!!!!!!!!!!!!! +else + echo ${ROOT_DIR} exist!!!!!!!!!!!!!!! 
+fi + +DATA_DIR=/cognitive_comp/lujunyu/data_zh/NER_Aligned/weibo/ +PRETRAINED_MODEL_PATH=/cognitive_comp/ganruyi/hf_models/zen/zh_zen_large_2.0 + +CHECKPOINT_PATH=${ROOT_DIR}/ckpt/ +OUTPUT_PATH=${ROOT_DIR}/predict.json + +DATA_ARGS="\ + --data_dir $DATA_DIR \ + --train_data train.all.bmes \ + --valid_data test.all.bmes \ + --test_data test.all.bmes \ + --train_batchsize 16 \ + --valid_batchsize 16 \ + --max_seq_length 256 \ + --task_name weibo \ + " + +MODEL_ARGS="\ + --learning_rate 3e-5 \ + --weight_decay 0.1 \ + --warmup_ratio 0.01 \ + --markup bioes \ + --middle_prefix M- \ + " + +MODEL_CHECKPOINT_ARGS="\ + --monitor val_f1 \ + --save_top_k 3 \ + --mode max \ + --every_n_train_steps 100 \ + --save_weights_only True \ + --dirpath $CHECKPOINT_PATH \ + --filename model-{epoch:02d}-{val_f1:.4f} \ + " + +TRAINER_ARGS="\ + --max_epochs 30 \ + --gpus 1 \ + --check_val_every_n_epoch 1 \ + --val_check_interval 20 \ + --default_root_dir $ROOT_DIR \ + " + + +options=" \ + --pretrained_model_path $PRETRAINED_MODEL_PATH \ + --vocab_file $PRETRAINED_MODEL_PATH/vocab.txt \ + --do_lower_case \ + --output_save_path $OUTPUT_PATH \ + $DATA_ARGS \ + $MODEL_ARGS \ + $MODEL_CHECKPOINT_ARGS \ + $TRAINER_ARGS \ +" +SCRIPT_PATH=/cognitive_comp/ganruyi/Fengshenbang-LM/fengshen/examples/zen2_finetune/fengshen_token_level_ft_task.py +/home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + +# SINGULARITY_PATH=/cognitive_comp/ganruyi/pytorch21_06_py3_docker_image_v2.sif +# python3 $SCRIPT_PATH $options +# source activate base +# singularity exec --nv -B /cognitive_comp/:/cognitive_comp/ $SINGULARITY_PATH /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options +# /home/ganruyi/anaconda3/bin/python $SCRIPT_PATH $options + diff --git a/fengshen/metric/metric.py b/fengshen/metric/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..5588db3726e30fbc955e619ecd24de3c2c5a1952 --- /dev/null +++ b/fengshen/metric/metric.py @@ -0,0 +1,129 @@ +# coding=utf-8 +from collections import Counter +import torch +from torch import nn +# import seqeval + +from .utils_ner import get_entities + + +class metrics_mlm_acc(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, logits, labels, masked_lm_metric): + + # if len(list(logits.shape))==3: + mask_label_size = 0 + for i in masked_lm_metric: + for j in i: + if j > 0: + mask_label_size += 1 + + y_pred = torch.argmax(logits, dim=-1) + + y_pred = y_pred.view(size=(-1,)) + y_true = labels.view(size=(-1,)) + masked_lm_metric = masked_lm_metric.view(size=(-1,)) + + corr = torch.eq(y_pred, y_true) + corr = torch.multiply(masked_lm_metric, corr) + + acc = torch.sum(corr.float())/mask_label_size + return acc + + +class EntityScore(object): + def __init__(self): + self.reset() + + def reset(self): + self.origins = [] + self.founds = [] + self.rights = [] + + def compute(self, origin, found, right): + recall = 0 if origin == 0 else (right / origin) + precision = 0 if found == 0 else (right / found) + f1 = 0. 
if recall + precision == 0 else (2 * precision * recall) / (precision + recall) + return recall, precision, f1 + + def result(self): + class_info = {} + + origin_counter = Counter([x[0] for x in self.origins]) + found_counter = Counter([x[0] for x in self.founds]) + right_counter = Counter([x[0] for x in self.rights]) + for type_, count in origin_counter.items(): + origin = count + found = found_counter.get(type_, 0) + right = right_counter.get(type_, 0) + recall, precision, f1 = self.compute(origin, found, right) + class_info[type_] = {"acc": round(precision, 4), 'recall': round(recall, 4), 'f1': round(f1, 4)} + origin = len(self.origins) + found = len(self.founds) + right = len(self.rights) + recall, precision, f1 = self.compute(origin, found, right) + return {'acc': precision, 'recall': recall, 'f1': f1}, class_info + + def update(self, true_subject, pred_subject): + self.origins.extend(true_subject) + self.founds.extend(pred_subject) + self.rights.extend([pre_entity for pre_entity in pred_subject if pre_entity in true_subject]) + +class SeqEntityScore(object): + def __init__(self, id2label, markup='bios', middle_prefix='I-'): + self.id2label = id2label + self.markup = markup + self.middle_prefix = middle_prefix + self.reset() + + def reset(self): + self.origins = [] + self.founds = [] + self.rights = [] + + def compute(self, origin, found, right): + recall = 0 if origin == 0 else (right / origin) + precision = 0 if found == 0 else (right / found) + f1 = 0. if recall + precision == 0 else (2 * precision * recall) / (precision + recall) + return recall, precision, f1 + + def result(self): + class_info = {} + origin_counter = Counter([x[0] for x in self.origins]) + found_counter = Counter([x[0] for x in self.founds]) + right_counter = Counter([x[0] for x in self.rights]) + for type_, count in origin_counter.items(): + origin = count + found = found_counter.get(type_, 0) + right = right_counter.get(type_, 0) + # print('origin:', origin, ' found:', found, ' right:', right) + recall, precision, f1 = self.compute(origin, found, right) + class_info[type_] = {"acc": round(precision, 4), 'recall': round(recall, 4), 'f1': round(f1, 4)} + origin = len(self.origins) + found = len(self.founds) + right = len(self.rights) + recall, precision, f1 = self.compute(origin, found, right) + return {'acc': precision, 'recall': recall, 'f1': f1}, class_info + + def update(self, label_paths, pred_paths): + ''' + labels_paths: [[],[],[],....] + pred_paths: [[],[],[],.....] 
+ + :param label_paths: + :param pred_paths: + :return: + Example: + >>> labels_paths = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + >>> pred_paths = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']] + ''' + for label_path, pre_path in zip(label_paths, pred_paths): + label_entities = get_entities(label_path, self.id2label, self.markup, self.middle_prefix) + pre_entities = get_entities(pre_path, self.id2label, self.markup, self.middle_prefix) + # print('label:', label_path, ',label_entities: ', label_entities) + # print('pred:', pre_path, ',pre_entities: ', pre_entities) + self.origins.extend(label_entities) + self.founds.extend(pre_entities) + self.rights.extend([pre_entity for pre_entity in pre_entities if pre_entity in label_entities]) diff --git a/fengshen/metric/utils_ner.py b/fengshen/metric/utils_ner.py new file mode 100644 index 0000000000000000000000000000000000000000..20efe33defdcbef59d75e83a1bf993eaadd962c8 --- /dev/null +++ b/fengshen/metric/utils_ner.py @@ -0,0 +1,261 @@ +import csv +import json +import torch +from transformers import BertTokenizer + + +class CNerTokenizer(BertTokenizer): + def __init__(self, vocab_file, do_lower_case=True): + super().__init__(vocab_file=str(vocab_file), do_lower_case=do_lower_case) + self.vocab_file = str(vocab_file) + self.do_lower_case = do_lower_case + + def tokenize(self, text): + _tokens = [] + for c in text: + if self.do_lower_case: + c = c.lower() + if c in self.vocab: + _tokens.append(c) + else: + _tokens.append('[UNK]') + return _tokens + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_text(self, input_file): + lines = [] + with open(input_file, 'r') as f: + words = [] + labels = [] + for line in f: + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if words: + lines.append({"words": words, "labels": labels}) + words = [] + labels = [] + else: + splits = line.split(" ") + words.append(splits[0]) + if len(splits) > 1: + labels.append(splits[-1].replace("\n", "")) + else: + # Examples could have no label for mode = "test" + labels.append("O") + if words: + lines.append({"words": words, "labels": labels}) + return lines + + @classmethod + def _read_json(self, input_file): + lines = [] + with open(input_file, 'r', encoding='utf8') as f: + for line in f: + line = json.loads(line.strip()) + text = line['text'] + label_entities = line.get('label', None) + words = list(text) + labels = ['O'] * len(words) + if label_entities is not None: + for key, value in label_entities.items(): + for sub_name, sub_index in value.items(): + for start_index, end_index in sub_index: + assert ''.join(words[start_index:end_index+1]) == sub_name + if start_index == end_index: + labels[start_index] = 'S-'+key + else: + if 
end_index - start_index == 1: + labels[start_index] = 'B-' + key + labels[end_index] = 'E-' + key + else: + labels[start_index] = 'B-' + key + labels[start_index + 1:end_index] = ['I-' + key] * (len(sub_name) - 2) + labels[end_index] = 'E-' + key + lines.append({"words": words, "labels": labels}) + return lines + + +def get_entity_bios(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIOS + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC'] + # >>> get_entity_bios(seq) + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("S-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[2] = indx + chunk[0] = tag.split('-')[1] + chunks.append(chunk) + chunk = (-1, -1, -1) + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif tag.startswith(middle_prefix) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entity_bio(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIO + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). + Example: + seq = ['B-PER', 'I-PER', 'O', 'B-LOC'] + get_entity_bio(seq) + #output + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + elif tag.startswith(middle_prefix) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entity_bioes(seq, id2label, middle_prefix='I-'): + """Gets entities from sequence. + note: BIOS + Args: + seq (list): sequence of labels. + Returns: + list: list of (chunk_type, chunk_start, chunk_end). 
+ Example: + # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC'] + # >>> get_entity_bios(seq) + [['PER', 0,1], ['LOC', 3, 3]] + """ + chunks = [] + chunk = [-1, -1, -1] + for indx, tag in enumerate(seq): + if not isinstance(tag, str): + tag = id2label[tag] + if tag.startswith("S-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[2] = indx + chunk[0] = tag.split('-')[1] + chunks.append(chunk) + chunk = (-1, -1, -1) + if tag.startswith("B-"): + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + chunk[1] = indx + chunk[0] = tag.split('-')[1] + elif (tag.startswith(middle_prefix) or tag.startswith("E-")) and chunk[1] != -1: + _type = tag.split('-')[1] + if _type == chunk[0]: + chunk[2] = indx + if indx == len(seq) - 1: + chunks.append(chunk) + else: + if chunk[2] != -1: + chunks.append(chunk) + chunk = [-1, -1, -1] + return chunks + + +def get_entities(seq, id2label, markup='bio', middle_prefix='I-'): + ''' + :param seq: + :param id2label: + :param markup: + :return: + ''' + assert markup in ['bio', 'bios', 'bioes'] + if markup == 'bio': + return get_entity_bio(seq, id2label, middle_prefix) + elif markup == 'bios': + return get_entity_bios(seq, id2label, middle_prefix) + else: + return get_entity_bioes(seq, id2label, middle_prefix) + + +def bert_extract_item(start_logits, end_logits): + S = [] + start_pred = torch.argmax(start_logits, -1).cpu().numpy()[0][1:-1] + end_pred = torch.argmax(end_logits, -1).cpu().numpy()[0][1:-1] + for i, s_l in enumerate(start_pred): + if s_l == 0: + continue + for j, e_l in enumerate(end_pred[i:]): + if s_l == e_l: + S.append((s_l, i, i + j)) + break + return S diff --git a/fengshen/models/DAVAE/BertForLatentConnector.py b/fengshen/models/DAVAE/BertForLatentConnector.py new file mode 100644 index 0000000000000000000000000000000000000000..08dffce16874a4b263fb604380e5490645cb483e --- /dev/null +++ b/fengshen/models/DAVAE/BertForLatentConnector.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import pdb + +import torch +from torch import nn +from transformers import BertConfig,BertPreTrainedModel +from transformers.models.bert.modeling_bert import BertEmbeddings,BertEncoder,BertPooler + + +class BertForLatentConnector(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. 
+ **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, latent_size): + super(BertForLatentConnector, self).__init__(config) + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + + self.linear = nn.Linear(config.hidden_size, 2 * latent_size, bias=False) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, emb_noise=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
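+        # For intuition: with attention_mask = [[1, 1, 0]] (one sequence, length 3),
+        # the lines below cast the mask to the model dtype and turn it into an
+        # additive bias of 0 for visible positions and -10000 for the padded one,
+        # which is then added to the raw attention scores inside the encoder.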
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + + if emb_noise is not None: + embedding_output = embedding_output + emb_noise(embedding_output).to(embedding_output.dtype) + + encoder_outputs = self.encoder(embedding_output, + extended_attention_mask, + head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/fengshen/models/DAVAE/DAVAEModel.py b/fengshen/models/DAVAE/DAVAEModel.py new file mode 100644 index 0000000000000000000000000000000000000000..24261832e029417651b6e61738b391bfc244b8b1 --- /dev/null +++ b/fengshen/models/DAVAE/DAVAEModel.py @@ -0,0 +1,235 @@ +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BertConfig,TransfoXLConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import cached_path,hf_bucket_url +from fengshen.models.DAVAE.GPT2ModelForLatent import GPT2ModelForLatent +from fengshen.models.DAVAE.BertForLatentConnector import BertForLatentConnector +from fengshen.models.DAVAE.run_latent_generation import * +# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def log_sum_exp(value, dim=None, keepdim=False): + """Numerically stable implementation of the operation + value.exp().sum(dim, keepdim).log() + """ + if dim is not None: + m, _ = torch.max(value, dim=dim, keepdim=True) + value0 = value - m + if keepdim is False: + m = m.squeeze(dim) + return m + torch.log(torch.sum(torch.exp(value0), dim=dim, keepdim=keepdim)) + else: + m = torch.max(value) + sum_exp = torch.sum(torch.exp(value - m)) + return m + torch.log(sum_exp) + +class VAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class DAVAEModel(VAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig,*model_args, **model_kwargs): + super().__init__(config=config) + self.config = config + self.vae_model = DAVAEModel.load_model(self.config) + + @classmethod + def load_model(cls, config): + encoder_config = BertConfig.from_dict(config.encoder) + encoder_model = 
BertForLatentConnector(config=encoder_config, latent_size=config.latent_size) + dec_config = TransfoXLConfig.from_dict(config.decoder) + dec_config.latent_size = config.latent_size + decoder_model = GPT2ModelForLatent(config=dec_config) + vae_model = EncDecAAE(config,encoder_model, decoder_model, dec_config.latent_size, pad_token_id=50000) + return vae_model + + def set_tokenizers(self,encoder_tokenizer,decoder_tokenizer): + if not hasattr(self, 'encoder_tokenizer'): + self.encoder_tokenizer = encoder_tokenizer + if not hasattr(self, 'decoder_tokenizer'): + self.decoder_tokenizer = decoder_tokenizer + + def simulate_batch(self,encoder_tokenizer,decoder_tokenizer, sent_inputs, prompt=None): + self.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + # 生成相似句 + latent_z = self.latent_code_from_text_batch(sent_inputs) + text_analogy = self.text_from_latent_code_batch(latent_z,prompt=prompt) + return text_analogy + + def latent_code_from_text_batch(self,texts): + # texts->latents + tokens_tensor_list = [] + for text in texts: + tokens = self.encoder_tokenizer.encode(text)[:510] + tokens_tensor_list.append(torch.tensor([101]+tokens+[102])) + + coded = pad_sequence(tokens_tensor_list, batch_first=True, padding_value=0).long() + device = next(self.vae_model.decoder.parameters()).device + with torch.no_grad(): + coded = coded.to(device) + pooled_hidden_fea = self.vae_model.encoder(coded, attention_mask=(coded > 0).float())[1] + mean, logvar = self.vae_model.encoder.linear(pooled_hidden_fea).chunk(2, -1) + + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + latent_z = mean + torch.mul(eps, std)*self.config.std_scale + return latent_z + def text_from_latent_code_batch(self,latent_z, prompt=None): + # latents->texts + device = next(self.vae_model.decoder.parameters()).device + past = latent_z + batch_size = latent_z.shape[0] + bos_token = self.decoder_tokenizer.convert_tokens_to_ids(self.decoder_tokenizer.bos_token) + end_token = self.decoder_tokenizer.convert_tokens_to_ids(self.decoder_tokenizer.eos_token) + + if prompt is not None: + prompt = [[bos_token] + self.decoder_tokenizer.encode(text)[:-1] for text in prompt] + else: + prompt = [[bos_token]]*batch_size + + context_tokens_tensor = torch.tensor([[end_token]*self.config.max_out_length]*batch_size).to(device) # 2-d tensor + context_length_tensor = torch.tensor([1]*batch_size).to(device) + for i in range(batch_size): + context_tokens_tensor[i,:len(prompt[i])] = torch.tensor(prompt[i]).long().to(device) + context_length_tensor[i] = len(prompt[i]) + + out = sample_sequence_conditional_batch( + model=self.vae_model.decoder, + max_out_length= self.config.max_out_length, + context_tokens_tensor=context_tokens_tensor, + context_length_tensor=context_length_tensor, + latent_z=latent_z, + temperature=self.config.temperature, + top_k=self.config.top_k, + top_p=self.config.top_p, + repetition_penalty=self.config.repetition_penalty, + device=device + ) + + out_text = [] + for i, tokens in enumerate(out): + tokens = tokens[len(prompt[i]):] + tokens = tokens[:tokens.index(end_token)] if end_token in tokens else tokens + text = self.decoder_tokenizer.decode(tokens, clean_up_tokenization_spaces=True) + out_text.append(filter_noise(text)) + return out_text +class EncDecAAE(nn.Module): + """Adversarial Auto-Encoder""" + def __init__(self,config, encoder, decoder, latent_size, pad_token_id): + super(EncDecAAE, self).__init__() + self.encoder = encoder + self.decoder = decoder + self.config = config + self.pad_token_id = pad_token_id + 
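+        # Latent-space discriminator for the adversarial (AAE) objective: a small
+        # MLP mapping a latent code z to one real/fake logit. loss_adv() below
+        # trains it to tell posterior samples apart from standard-Normal draws.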
self.Disc = nn.Sequential(nn.Linear(latent_size, 4*latent_size), nn.ReLU(), + nn.Linear(4*latent_size, 1)) + # Standard Normal prior + loc = torch.zeros(latent_size) + scale = torch.ones(latent_size) + self.prior = torch.distributions.normal.Normal(loc, scale) + + def connect(self, bert_fea, nsamples=1, fb_mode=0): + """ + Returns: Tensor1, Tensor2 + Tensor1: the tensor latent z with shape [batch, nsamples, nz] + Tensor2: the tenor of KL for each x with shape [batch] + """ + # (batch_size, nz) + + mean, logvar = self.encoder.linear(bert_fea).chunk(2, -1) + z = self.reparameterize(mean, logvar, nsamples) + if fb_mode == 0: + KL = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) + elif fb_mode == 1: + kl_loss = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1) + kl_mask = (kl_loss > self.config.dim_target_kl).float() + KL = (kl_mask * kl_loss).sum(dim=1) + + return z, KL + + def connect_deterministic(self, bert_fea, nsamples=1): + """ + Returns: Tensor1, Tensor2 + Tensor1: the tensor latent z with shape [batch, nsamples, nz] + Tensor2: the tenor of KL for each x with shape [batch] + """ + + # (batch_size, nz) + + mean, logvar = self.encoder.linear(bert_fea).chunk(2, -1) + logvar = torch.zeros_like(logvar) + z = self.reparameterize(mean, logvar, nsamples) + KL = 0.5 * (mean.pow(2) + logvar.exp() - logvar - 1).sum(dim=1) + + return z, KL + + def reparameterize(self, mu, logvar, nsamples=1): + """sample from posterior Gaussian family + Args: + mu: Tensor + Mean of gaussian distribution with shape (batch, nz) + logvar: Tensor + logvar of gaussian distibution with shape (batch, nz) + Returns: Tensor + Sampled z with shape (batch, nsamples, nz) + """ + batch_size, nz = mu.size() + std = logvar.mul(0.5).exp() + + mu_expd = mu.unsqueeze(1).expand(batch_size, nsamples, nz) + std_expd = std.unsqueeze(1).expand(batch_size, nsamples, nz) + + eps = torch.zeros_like(std_expd).normal_() + + return mu_expd + torch.mul(eps, std_expd) + + def loss_adv(self, z): + zn = torch.randn_like(z) + zeros = torch.zeros(len(z), 1, device=z.device).half() + ones = torch.ones(len(z), 1, device=z.device).half() + + loss_d = F.binary_cross_entropy_with_logits(self.Disc(z.detach().half()), zeros) + \ + F.binary_cross_entropy_with_logits(self.Disc(zn.half()), ones) + loss_g = F.binary_cross_entropy_with_logits(self.Disc(z.half()), ones) + return loss_d, loss_g + + def forward(self, inputs, labels, beta=0.0, iw=None, fb_mode=0, emb_noise=None): + attention_mask = (inputs > 0).float() + reconstrution_mask = (labels != self.pad_token_id).float() # the padding token for GPT2 + sent_length = torch.sum(reconstrution_mask, dim=1) + + outputs = self.encoder(inputs, attention_mask, emb_noise=emb_noise) + pooled_hidden_fea = outputs[1] + + seq_length = labels.size(1) + dec_attn_mask = self.decoder.get_attn_mask(seq_length).to(labels.device) + + if fb_mode in [0,1]: + latent_z, loss_kl = self.connect(pooled_hidden_fea, fb_mode=fb_mode) + latent_z = latent_z.squeeze(1) + outputs = self.decoder(input_ids=labels, attention_mask=dec_attn_mask, latent_state=latent_z, labels=labels, label_ignore=self.pad_token_id) # ignore loss over padding tokens + loss_rec = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) + elif fb_mode==2: + latent_z, loss_kl = self.connect_deterministic(pooled_hidden_fea) + latent_z = latent_z.squeeze(1) + outputs = self.decoder(input_ids=labels, attention_mask=dec_attn_mask, latent_state=latent_z, labels=labels, label_ignore=self.pad_token_id) + loss_rec = outputs[0] # model outputs 
are always tuple + + if self.config.length_weighted_loss: + loss = loss_rec / sent_length + beta * loss_kl + else: + loss = loss_rec + beta * loss_kl + + if iw!=None: + total_loss = torch.sum(loss*iw)/torch.sum(iw) + else: + total_loss = torch.sum(loss) + return (loss_rec/sent_length).mean(), loss_kl.mean(), total_loss + diff --git a/fengshen/models/DAVAE/GPT2ModelForLatent.py b/fengshen/models/DAVAE/GPT2ModelForLatent.py new file mode 100644 index 0000000000000000000000000000000000000000..47d5f50a73d26bf38d2fcf7d2620ce3d8aa547af --- /dev/null +++ b/fengshen/models/DAVAE/GPT2ModelForLatent.py @@ -0,0 +1,640 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT-2 model.""" + +import torch +import torch.nn.functional as F +import math +import torch.nn as nn +from torch.nn import CrossEntropyLoss +# from ......configuration_transfo_xl import TransfoXLConfig +from transformers import TransfoXLConfig + +from transformers.modeling_utils import ( + PreTrainedModel +) + + +class PositionalEmbedding(torch.nn.Module): + def __init__(self, hidden_size): + super(PositionalEmbedding, self).__init__() + + self.hidden_size = hidden_size + + inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[None, :, :].expand(bsz, -1, -1) + else: + return pos_emb[None, :, :] + +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, '{} is not divisible by {}'.format( + numerator, denominator) + +def divide(numerator, denominator): + """Ensure that numerator is divisible by the denominator and return + the division value.""" + ensure_divisibility(numerator, denominator) + return numerator // denominator + +def scaled_init_method(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + +def unscaled_init_method(sigma): + """Init method based on N(0, sigma).""" + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + +@torch.jit.script +def gelu_impl(x): + """OpenAI's gelu implementation.""" + return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * + (1.0 + 0.044715 * x * x))) + +def gelu(x): + return gelu_impl(x) + +class GPT2SelfAttention(torch.nn.Module): + """Parallel self-attention layer for GPT2. + + Self-attention layer takes input with size [b, s, h] where b is + the batch size, s is the sequence lenght, and h is the hidden size + and creates output of the same size. + Arguments: + hidden_size: total hidden size of the layer (h). 
+ num_attention_heads: number of attention heads (n). Note that we + require n to be divisible by number of GPUs + used to parallelize the model. Also, we + require hidden size to be divisible by n. + dropout_prob: dropout probability for the attention scores. + init_method: weight initialization. + output_layer_init_method: output layer initialization. If None, use + `init_method`. + We use the following notation: + h: hidden_size + n: num_attention_heads + p: number of partitions + np: n/p + hp: h/p + hn: h/n + b: batch size + s: sequence length + """ + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, output_layer_init_method=None, relative_encoding=False): + super(GPT2SelfAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + # Per attention head and per partition values. + self.hidden_size_per_partition = hidden_size + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = num_attention_heads + self.relative_encoding = relative_encoding + # Strided linear layer. + self.query_key_value = torch.nn.Linear(hidden_size, 3*hidden_size, bias=True) + + if relative_encoding: + self.relative = torch.nn.Linear(hidden_size, hidden_size, bias=True) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = torch.nn.Linear(hidden_size, hidden_size, bias=True) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. + """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + @staticmethod + def _rel_shift(x, zero_triu=False): + # ql x kl x bsz x h + # bsz x h x ql x kl + zero_pad = torch.zeros((*x.size()[:-2], x.size(-2), 1), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=-1) + + x_padded = x_padded.view(*x.size()[:-2], x.size(-1) + 1, x.size(-2)) + + x = x_padded[:, :, 1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:, :, None, None] + + return x + + @staticmethod + def _rel_shift_latest(x: torch.Tensor): + ndims = x.dim() + x_shape = x.size() + row_dim = 2 + col_dim = row_dim + 1 + assert col_dim < ndims + tgt_shape_1, tgt_shape_2 = [], [] + for i in range(ndims): + if i == row_dim: + tgt_shape_1.append(x_shape[col_dim]) + tgt_shape_2.append(x_shape[row_dim]) + elif i == col_dim: + tgt_shape_1.append(x_shape[row_dim]) + tgt_shape_2.append(x_shape[col_dim] - 1) + else: + tgt_shape_1.append(x_shape[i]) + tgt_shape_2.append(x_shape[i]) + x = x.view(*tgt_shape_1) + x = x[:, :, 1:, :] + x = x.view(*tgt_shape_2) + return x + + def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Attention heads. 
[b, s, hp] + query_length = hidden_states.size(1) + + if mem is None: + mixed_x_layer = self.query_key_value(hidden_states) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = torch.chunk(mixed_x_layer, 3, dim=-1) + else: + cat = torch.cat((mem, hidden_states), 1) + mixed_x_layer = self.query_key_value(cat) + (mixed_query_layer, + mixed_key_layer, + mixed_value_layer) = torch.chunk(mixed_x_layer, 3, dim=-1) + mixed_query_layer = mixed_query_layer[:, -query_length:] + + # Reshape and transpose [b, np, s, hn] + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + if self.relative_encoding: + relative_layer = self.relative(position_embeddings) + relative_layer = self._transpose_for_scores(relative_layer) # 1 (bsz) x n_head x klen x d_head + # Raw attention scores. [b, np, qs, ks] + rw_head_q = query_layer + r_w_bias.unsqueeze(1) + ac_score = torch.matmul(rw_head_q, key_layer.transpose(-1, -2)) + rr_head_q = query_layer + r_r_bias.unsqueeze(1) + bd_score = torch.matmul(rr_head_q, relative_layer.transpose(-1, -2)) + bd_score = self._rel_shift(bd_score) # qlen x klen x bsz x n_head + # bd_score = bd_score.permute(2, 3, 0, 1) # bsz n_head qlen klen + + attention_scores = ac_score + bd_score + else: + # Raw attention scores. [b, np, s, s] + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt( + self.hidden_size_per_attention_head) + + # Apply the left to right attention mask. + attention_scores = torch.mul(attention_scores, ltor_mask) - \ + 10000.0 * (1.0 - ltor_mask) + + # Attention probabilities. [b, np, s, s] + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + # with get_cuda_rng_tracker().fork(): + # attention_probs = self.attention_dropout(attention_probs) + + # Context layer. + # [b, np, s, hn] + # print(f'attn_probs {attention_probs}, value_layer {value_layer}') + context_layer = torch.matmul(attention_probs, value_layer.float()) + # [b, s, np, hn] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + # [b, s, hp] + context_layer = context_layer.view(*new_context_layer_shape) + + # Output. [b, s, h] + output = self.dense(context_layer) + output = self.output_dropout(output) + + return output + +class GPT2MLP(torch.nn.Module): + """MLP for GPT2. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform gelu transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + + Arguments: + hidden_size: The hidden size of the self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layer initialization. If None, + use `init_method`. + """ + + def __init__(self, hidden_size, output_dropout_prob, init_method, + output_layer_init_method=None): + super(GPT2MLP, self).__init__() + # Set output layer initialization if not provided. 
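+        # If the caller does not pass a dedicated initializer for the output-side
+        # projection, reuse init_method (cf. scaled_init_method above, which shrinks
+        # the std by 1/sqrt(2*num_layers) for layers feeding a residual connection).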
+ if output_layer_init_method is None: + output_layer_init_method = init_method + # Project to 4h. + self.dense_h_to_4h = torch.nn.Linear(hidden_size, 4*hidden_size) + # Project back to h. + self.dense_4h_to_h = torch.nn.Linear(4*hidden_size, hidden_size) + self.dropout = torch.nn.Dropout(output_dropout_prob) + + def forward(self, hidden_states): + # [b, s, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = gelu(intermediate_parallel) + + # [b, s, h] + output = self.dense_4h_to_h(intermediate_parallel) + output = self.dropout(output) + return output + + +class GPT2TransformerLayer(torch.nn.Module): + """A single layer transformer for GPT2. + + We use the following notation: + h: hidden size + n: number of attention heads + b: batch size + s: sequence length + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + + Arguments: + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method: initialization method used for the weights. Note + that all biases are initialized to zero and + layernorm weight are initialized to one. + output_layer_init_method: output layers (attention output and + mlp output) initialization. If None, + use `init_method`. + """ + def __init__(self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + output_layer_init_method=None, + relative_encoding=False): + super(GPT2TransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + + # Layernorm on the input data. + self.input_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. + self.attention = GPT2SelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding) + + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm(hidden_size, + eps=layernorm_epsilon) + + # MLP + self.mlp = GPT2MLP( + hidden_size, + output_dropout_prob, + init_method, + output_layer_init_method=output_layer_init_method) + + def forward(self, hidden_states, ltor_mask, position_embeddings=None, r_w_bias=None, r_r_bias=None, mem=None): + # hidden_states: [b, s, h] + # ltor_mask: [1, 1, s, s] + + # Layer norm at the begining of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + mem = self.input_layernorm(mem) if mem is not None else None + # Self attention. + attention_output = self.attention(layernorm_output, ltor_mask, position_embeddings, r_w_bias, r_r_bias, mem) + # Residual connection. + # print(f'hz {hidden_states.shape}, attn {attention_output.shape}') + layernorm_input = hidden_states + attention_output + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP. + mlp_output = self.mlp(layernorm_output) + # Second residual connection. 
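+        # Net effect of this pre-LayerNorm block:
+        #   out = (x + Attn(LN(x))) + MLP(LN(x + Attn(LN(x)))),
+        # where the line below adds the second of the two residual paths.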
+ output = layernorm_input + mlp_output + + return output + +class GPT2TransformerForLatent(torch.nn.Module): + """GPT-2 transformer. + + This module takes input from embedding layer and it's output can + be used directly by a logit layer. It consists of L (num-layers) + blocks of: + layer norm + self attention + residual connection + layer norm + mlp + residual connection + followed by a final layer norm. + + Arguments: + num_layers: Number of transformer layers. + hidden_size: The hidden size of the self attention. + num_attention_heads: number of attention head in the self + attention. + attention_dropout_prob: dropout probability of the attention + score in self attention. + output_dropout_prob: dropout probability for the outputs + after self attention and final output. + checkpoint_activations: if True, checkpoint activations. + checkpoint_num_layers: number of layers to checkpoint. This + is basically the chunk size in checkpoitning. + layernorm_epsilon: epsilon used in layernorm to avoid + division by zero. + init_method_std: standard deviation of the init method which has + the form N(0, std). + use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers) + scaling for the output weights ( + output of self attention and mlp). + """ + def __init__(self, + num_layers, + hidden_size, + num_attention_heads, + max_sequence_length, + max_memory_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + latent_size = 64, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + use_scaled_init_for_output_weights=True, + relative_encoding=False): + super(GPT2TransformerForLatent, self).__init__() + # Store activation checkpoiting flag. + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_memory_length = max_memory_length + + self.latent_size = latent_size + # self.linear = nn.Linear(self.latent_size, hidden_size * num_layers, bias=False).float() # different latent vector for each layer + # self.linear_emb = nn.Linear(self.latent_size, hidden_size * num_layers, bias=False).float() + self.linear_emb = nn.Linear(self.latent_size, hidden_size, bias=False).float() + + # torch.nn.init.normal_(self.linear.weight, mean=0.0, std=init_method_std) + torch.nn.init.normal_(self.linear_emb.weight, mean=0.0, std=init_method_std) + + + output_layer_init_method = None + if use_scaled_init_for_output_weights: + output_layer_init_method = scaled_init_method(init_method_std, + num_layers) + # Embeddings dropout + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + self.relative_encoding = relative_encoding + if relative_encoding: + # Relative position embedding + self.position_embeddings = PositionalEmbedding(hidden_size) + # Per attention head and per partition values. + self.hidden_size_per_attention_head = divide(hidden_size, + num_attention_heads) + self.num_attention_heads_per_partition = num_attention_heads + self.r_w_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)) + self.r_r_bias = torch.nn.Parameter( + torch.Tensor(self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)) + + # Always initialize bias to zero. + with torch.no_grad(): + self.r_w_bias.zero_() + self.r_r_bias.zero_() + else: + # Position embedding (serial). + self.position_embeddings = torch.nn.Embedding(max_sequence_length, + hidden_size) + # Initialize the position embeddings. 
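+            # Learned absolute position table for the non-relative case; when
+            # relative_encoding is True the Transformer-XL style PositionalEmbedding
+            # with r_w_bias / r_r_bias above is created instead.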
+ torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std) + + def get_layer(): + return GPT2TransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + unscaled_init_method(init_method_std), + output_layer_init_method=output_layer_init_method, + relative_encoding=relative_encoding) + + # Transformer layers. + self.layers = torch.nn.ModuleList( + [get_layer() for _ in range(num_layers)]) + + # Final layer norm before output. + self.final_layernorm = torch.nn.LayerNorm(hidden_size, eps=layernorm_epsilon) + + + def forward(self, hidden_states, attention_mask, latent_state, mems): + batch_size, query_length, hidden_size = hidden_states.size() + # memory_length = self.latent_size + memory_length = mems[0].size(1) if mems else 0 + + # key_length = query_length + memory_length+1 + # attention_mask = attention_mask[:, :, :, -query_length-memory_length-1:] + key_length = query_length + memory_length + attention_mask = attention_mask[:, :, :, -query_length - memory_length:] + + if latent_state is not None: + latent_emb = self.linear_emb(latent_state) + # latent_emb = torch.split(latent_emb.unsqueeze(1), hidden_size, dim=2) + latent_emb = latent_emb.unsqueeze(1) + # print(f'latent_state {latent_state.half()}\n linear_emb {self.linear_emb.weight} \n latent_emb {latent_emb}') + # torch.save(latent_state, '/cognitive_comp/wanghao/experiments/fengshen/latent_state.pt') + # torch.save(self.linear_emb, '/cognitive_comp/wanghao/experiments/fengshen/weight.pt') + + + position_sequence = torch.arange(key_length - 1, -1, -1.0, device=hidden_states.device, + dtype=hidden_states.dtype) + position_embeddings = self.position_embeddings(position_sequence) + + # print(f'pos {position_embeddings.shape}, latent {latent_emb.shape}') + # if latent_state is not None: + # position_embeddings += latent_emb.unsqueeze(0) + # Apply dropout + position_embeddings = self.embedding_dropout(position_embeddings) + + # print(f'latent_emb {latent_emb.shape}, {hidden_states.shape}') + if latent_state is not None: + hidden_states = hidden_states + latent_emb + hidden_states = self.embedding_dropout(hidden_states) + + # latent_mem = self.linear(latent_state.half()) + # latent_mem = torch.split(latent_mem.unsqueeze(1), hidden_size, dim=2) + + if self.max_memory_length > 0: + mem_layers = [hidden_states.detach()] + else: + mem_layers = [] + + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask] + if self.relative_encoding: + args += [position_embeddings, self.r_w_bias, self.r_r_bias] + + mem_i = mems[i] if mems else None + # print(f'mems {len(mems)} {mems[0].shape}') + # mem_i = torch.cat((latent_mem[i], mems[i]), 1) if mems else latent_mem[i] + # print(f'mem_i {mem_i.shape}, {mem_i}') + hidden_states = layer(*args, mem=mem_i) + + if latent_state is not None: + hidden_states = hidden_states + latent_emb + + if self.max_memory_length > 0: + mem_layers.append(hidden_states.detach()) + # print(f'mem_layers {len(mem_layers)} mems {len(mems)}') + # Final layer norm. 
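+        # After the final LayerNorm below, and when max_memory_length > 0, the
+        # per-layer states collected in mem_layers are folded into the rolling
+        # Transformer-XL style memory via update_mems().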
+ output = self.final_layernorm(hidden_states) + if self.max_memory_length > 0: + mem_layers = self.update_mems(mem_layers, mems) + + return (output, mem_layers) + + def update_mems(self, hiddens, mems): + memory_length = mems[0].size(1) if mems else 0 + query_length = hiddens[0].size(1) + new_memory_length = min(self.max_memory_length, memory_length + query_length) + new_mems = [] + with torch.no_grad(): + for i in range(len(hiddens)): + if new_memory_length <= query_length: + new_mems.append(hiddens[i][:, -new_memory_length:]) + else: + new_mems.append(torch.cat((mems[i][:, -new_memory_length+query_length:], hiddens[i]), dim=1)) + return new_mems + + +class GPT2ModelForLatent(PreTrainedModel): + """GPT-2 Language model. + + The output of the forward method are the logits (parallel or + serial depending on the `parallel_output` flag. + """ + + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + + def __init__(self, config:TransfoXLConfig): + super().__init__(config) + self.config = config + + self.word_embeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size) + + # Transformer + self.transformer = GPT2TransformerForLatent(config.num_layers, + config.hidden_size, + config.num_attention_heads, + config.max_sequence_length, + config.max_memory_length, + config.embedding_dropout_prob, + config.attention_dropout_prob, + config.output_dropout_prob, + config.checkpoint_activations, + config.latent_size, + config.checkpoint_num_layers, + relative_encoding=config.relative_encoding) + + + def forward(self, input_ids, attention_mask, latent_state, mems=None, labels=None, label_ignore=None): + embeddings = self.word_embeddings(input_ids) + + # Transformer. + logits, hidden_layers = self.transformer(embeddings, attention_mask, latent_state, mems) + lm_logits = F.linear(logits, + self.word_embeddings.weight) + + outputs = (lm_logits, hidden_layers) # (bz, sql, vocab), () + if labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + loss_fct = CrossEntropyLoss(ignore_index=label_ignore, reduce=False) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + loss = torch.sum(loss.view(-1, shift_labels.shape[-1]), -1) + outputs = (loss,) + outputs + + return outputs + + def get_attn_mask(self, seq_length): + # mem_length = self.config.max_memory_length + 1 + mem_length = self.config.max_memory_length + attention_mask = torch.ones((1, seq_length, seq_length + mem_length)) + attention_mask = torch.tril(torch.triu(attention_mask, 1 - seq_length + mem_length), mem_length) + attention_mask = attention_mask.unsqueeze(1) + return attention_mask diff --git a/fengshen/models/DAVAE/__init__.py b/fengshen/models/DAVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ae84b6461f88023821149c4d8a994cfc24e6f38c --- /dev/null +++ b/fengshen/models/DAVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DAVAE model. """ diff --git a/fengshen/models/DAVAE/run_latent_generation.py b/fengshen/models/DAVAE/run_latent_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f099d205279d883df589fe5031ff0fdbcfb32d --- /dev/null +++ b/fengshen/models/DAVAE/run_latent_generation.py @@ -0,0 +1,302 @@ +import re +import torch +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence +import numpy as np +import json +import jsonlines +from tqdm import tqdm, trange + +def set_seed(args): + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + +def filter_noise(text): + space_pattern = '([\u4e00-\u9fa5|0-9|,|。|?|!|@|¥|……|——|《|》|“|”|、|;|:|‘|’|(|)|「|」|【|】|·|~|-|+])\s+([\u4e00-\u9fa5|0-9|,|。|?|!|@|¥|……|——|《|》|“|”|、|;|:|‘|’|(|)|「|」|【|】|·|~|-|+])' + text = re.sub(space_pattern, r'\1\2', text) + text = re.sub(space_pattern, r'\1\2', text) + patterns = ['引用日期.*$', '参考资料.*$', '\[.*\]', '【.*】', '原文地址:', '原文转载:', '本文转自:', '本文摘要:', ''] + for pattern in patterns: + text = re.sub(pattern, "", text) + return text.strip() + +def get_raw_data(raw_data): + train_data = {} + with open(raw_data, 'r', encoding='utf8') as fh: + for line in fh: + line = json.loads(line) + for key in line.keys(): + if key not in train_data.keys(): + train_data[key] = [line[key]] + else: + train_data[key].append(line[key]) + return train_data + +def save_output(input_text, output, output_file): + with jsonlines.open(output_file, mode='a') as writer: + for text_in,text_out in zip(input_text, output): + otc = {} + otc['text_a'] = str(text_in) + otc['text_b'] = str(text_out) + writer.write(otc) + +def enforce_repetition_penalty(lprobs, prev_output_tokens, repetition_penalty = 1.5): + """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ + for i in range(len(prev_output_tokens)): + for previous_token in set(prev_output_tokens[i]): + # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability + if lprobs[i, previous_token] < 0: + lprobs[i, previous_token] *= repetition_penalty + else: + lprobs[i, previous_token] /= repetition_penalty + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (vocabulary size) + top_k > 0: keep only top k tokens with highest probability (top-k filtering). + top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + # assert logits.dim() == 1# batch size 1 for now - could be updated for more but the code would be less clear + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + for i in range(sorted_indices.size()[0]): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + # indices_to_remove = sorted_indices[sorted_indices_to_remove] + # logits[indices_to_remove] = filter_value + return logits + +def sample_sequence_conditional(model, length, context, latent_z=None, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu'): + + context = torch.tensor(context, dtype=torch.long, device=device) + context = context.unsqueeze(0) + generated = context + with torch.no_grad(): + for i in trange(length): + if i == 2: + generated[generated[:, 1] == 127, 1] = 0 + attention_mask = model.get_attn_mask(generated.shape[1]).to(device) + inputs = {'input_ids': generated, 'latent_state': latent_z, 'attention_mask':attention_mask, 'mems':None} + outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states) + next_token_logits = outputs[0][:, -1, :] / temperature + filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + + log_probs = F.softmax(filtered_logits, dim=-1) + if repetition_penalty != 1.0: + enforce_repetition_penalty(log_probs, generated, repetition_penalty) + next_token = torch.multinomial(log_probs, num_samples=1) + generated = torch.cat((generated, next_token), dim=1) + # pdb.set_trace() + # if next_token[0,0].item() == decoder_tokenizer.encode('')[0]: + if next_token[0, 0] == 50000: # end of token 50000 + break + + return generated + +def latent_code_from_text(text, tokenizer_encoder, model_vae, args, scale=1.0): + tokenized1 = tokenizer_encoder.encode(text) + coded = torch.Tensor([tokenized1]).long() + with torch.no_grad(): + coded = coded.to(device) + outputs = model_vae.encoder(coded, attention_mask=(coded > 0).float()) + pooled_hidden_fea = outputs[1] + + mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1) + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + return mean + torch.mul(eps, std)*scale + +def text_from_latent_code(latent_z, model_vae, args, tokenizer_decoder, prompt=None): + bos_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.bos_token) + context_tokens = [bos_token] + + if prompt is not None: + context_tokens.append(tokenizer_decoder.encode(prompt)[:-1]) # remove eos token + + out = sample_sequence_conditional( + model=model_vae.decoder, + context=context_tokens, + latent_z=latent_z, + length= args.max_out_length, # Chunyuan: Fix length; or use to complete a 
sentence + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + device=device + ) + + out_tokens = out[0, :].tolist() + out_tokens = out_tokens[1:out_tokens.index(50000)] if 50000 in out_tokens else out_tokens # remove bos and eos + text_x1 = tokenizer_decoder.decode(out_tokens, clean_up_tokenization_spaces=True) + + return text_x1 + + +def simulate(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_input, prompt=None): + latent_z, _ = latent_code_from_text(sent_input, tokenizer_encoder, model_vae, args) + text_analogy = text_from_latent_code(latent_z, model_vae, args, tokenizer_decoder, prompt=prompt) + + return text_analogy + +def switch(next_value, init, is_update): + is_update = is_update.type_as(next_value) + return (1-is_update)*init + is_update*next_value + +def sample_sequence_conditional_batch(model, max_out_length, context_tokens_tensor, context_length_tensor, latent_z=None, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, device='cpu', end_token=50000): + org_context_length = torch.min(context_length_tensor).item() + batch_size = context_tokens_tensor.shape[0] + + generated = context_tokens_tensor[:,:org_context_length] + counter = org_context_length + + output_tokens_lists = [] + output_order = [] + orig_order = torch.LongTensor(list(range(batch_size))) + + with torch.no_grad(): + while counter < max_out_length: + if counter == org_context_length+2: + generated[generated[:,org_context_length] == 127, org_context_length] = 0 + attention_mask = model.get_attn_mask(generated.shape[1]).to(device) + inputs = {'input_ids': generated, 'latent_state': latent_z, 'attention_mask': attention_mask} + outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states) + next_token_logits = outputs[0][:, -1, :] / temperature + filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + + # if counter == org_context_length: + # filtered_logits[:, 43488] = -float('Inf') # forbid starting with '《' + log_probs = F.softmax(filtered_logits, dim=-1) + + if repetition_penalty != 1.0: + enforce_repetition_penalty(log_probs, generated, repetition_penalty) + + if any(log_probs.sum(dim=-1) <= 0.0) : + break + next_token = torch.multinomial(log_probs, num_samples=1).view(-1) + next_token = switch(next_token, context_tokens_tensor[:, counter], context_length_tensor<=counter) + + if torch.all(next_token == end_token).item(): + break + + stop_idx = next_token == end_token + output_order.extend(orig_order[stop_idx].tolist()) + + finished = generated[stop_idx] + output_tokens_lists.extend(finished.detach().cpu().tolist()) + # continue with non-ending tokens + conti_idx = next_token != end_token + orig_order = orig_order[conti_idx] + generated = generated[conti_idx] + latent_z = latent_z[conti_idx] + + next_token = next_token[conti_idx] + context_tokens_tensor = context_tokens_tensor[conti_idx] + context_length_tensor = context_length_tensor[conti_idx] + batch_size = generated.shape[0] + + generated = torch.cat((generated, next_token.view(batch_size, 1)), dim=-1) + counter += 1 + + output_order.extend(orig_order.tolist()) + generated = generated.detach().cpu().tolist() + output_tokens_lists.extend(generated) + output_tokens_lists = [tokens[:tokens.index(end_token)] if end_token in tokens else tokens for tokens in output_tokens_lists] + + output_tokens_lists = [tokens for _,tokens in sorted(zip(output_order, output_tokens_lists))] + + return output_tokens_lists 
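+# The helpers below form the batched encode/decode path: latent_code_from_text_batch
+# tokenizes each text (truncated to 510 tokens and wrapped in the [CLS]/[SEP] ids 101/102),
+# pools the encoder output, and reparameterizes the predicted mean/logvar into a latent z
+# scaled by args.std_scale; text_from_latent_code_batch then decodes those latents with
+# sample_sequence_conditional_batch above, optionally seeding each sequence with a prompt.
+# A minimal usage sketch (assuming the same `args` namespace used elsewhere in this file):
+#   latent_z = latent_code_from_text_batch(texts, tokenizer_encoder, model_vae, args)
+#   out_text = text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder)
+# simulate_batch chains exactly these two calls.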
+ +def latent_code_from_text_batch(texts, tokenizer_encoder, model_vae, args): + tokens_tensor_list = [] + for text in texts: + tokens = tokenizer_encoder.encode(text)[:510] + tokens_tensor_list.append(torch.tensor([101]+tokens+[102])) + + coded = pad_sequence(tokens_tensor_list, batch_first=True, padding_value=0).long() + with torch.no_grad(): + coded = coded.to(device) + pooled_hidden_fea = model_vae.encoder(coded, attention_mask=(coded > 0).float())[1] + mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1) + + std = logvar.mul(0.5).exp() + eps = torch.zeros_like(std).normal_() + + latent_z = mean + torch.mul(eps, std)*args.std_scale + + return latent_z + +def text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder, prompt=None): + past = latent_z + batch_size = latent_z.shape[0] + bos_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.bos_token) + end_token = tokenizer_decoder.convert_tokens_to_ids(tokenizer_decoder.eos_token) + + if prompt is not None: + prompt = [[bos_token] + tokenizer_decoder.encode(text)[:-1] for text in prompt] + else: + prompt = [[bos_token]]*batch_size + + context_tokens_tensor = torch.tensor([[end_token]*args.max_out_length]*batch_size).to(device) # 2-d tensor + context_length_tensor = torch.tensor([1]*batch_size).to(device) + for i in range(batch_size): + context_tokens_tensor[i,:len(prompt[i])] = torch.tensor(prompt[i]).long().to(device) + context_length_tensor[i] = len(prompt[i]) + + # length = 128 # maximum length, but not used + out = sample_sequence_conditional_batch( + model=model_vae.decoder, + max_out_length= args.max_out_length, # Chunyuan: Fix length; or use to complete a sentence + context_tokens_tensor=context_tokens_tensor, + context_length_tensor=context_length_tensor, + latent_z=latent_z, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + device=device + ) + + out_text = [] + for i, tokens in enumerate(out): + tokens = tokens[len(prompt[i]):] + tokens = tokens[:tokens.index(end_token)] if end_token in tokens else tokens + text = tokenizer_decoder.decode(tokens, clean_up_tokenization_spaces=True) + out_text.append(filter_noise(text)) + return out_text + + +def simulate_batch(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_inputs, prompt=None): + latent_z = latent_code_from_text_batch(sent_inputs, tokenizer_encoder, model_vae, args) + text_analogy = text_from_latent_code_batch(latent_z, model_vae, args, tokenizer_decoder, prompt=prompt) + return text_analogy + +def simulate_bz(model_vae, tokenizer_encoder, tokenizer_decoder, args, sent_inputs, prompt=None): + latent_z = latent_code_from_text_batch(sent_inputs, tokenizer_encoder, model_vae, args) + return latent_z + +def my_shuffle(x, index): + result = [] + for field in index: + result.append(x[field]) + return result + diff --git a/fengshen/models/GAVAE/GAVAEModel.py b/fengshen/models/GAVAE/GAVAEModel.py new file mode 100644 index 0000000000000000000000000000000000000000..fa74f95fd775ed17c9e25d9564f94c93b50347f8 --- /dev/null +++ b/fengshen/models/GAVAE/GAVAEModel.py @@ -0,0 +1,67 @@ +# -*- encoding: utf-8 -*- +''' +Copyright 2022 The International Digital Economy Academy (IDEA). CCNL team. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@File : GAVAEModel.py +@Time : 2022/11/04 11:35 +@Author : Liang Yuxin +@Version : 1.0 +@Contact : liangyuxin@idea.edu.cn +@License : (C)Copyright 2022-2023, CCNL-IDEA +''' +import torch +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig + +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from fengshen.models.GAVAE.gans_model import gans_process + + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class GAVAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class GAVAEModel(GAVAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig) -> None: + super().__init__(config) + self.config =config + config.device = device + self.gan = gans_process(self.config) + self.vae_model = DAVAEModel(self.config) + + def train_gan(self,encoder_tokenizer,decoder_tokenizer,input_texts): + self.vae_model.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + n = len(input_texts) + inputs_latents = self.vae_model.latent_code_from_text_batch(input_texts) + well_trained_gan = False + while not well_trained_gan: + self.gan_training(inputs_latents) + latent = torch.tensor(self.gan.gen_test(n)) + if not latent.isnan().any(): + well_trained_gan = True + + def generate(self,n): + latent_z = torch.tensor(self.gan.gen_test(n)).to(device) + text = self.vae_model.text_from_latent_code_batch(latent_z,prompt=None) + return text + + def gan_training(self,inputs_latents): + for gt in range(self.config.gan_epoch): + x_train,y_train,x_test,y_test,perm = self.gan.ready_cls(inputs_latents) + # sent_output:latent_z inputs_labels:id of class label + self.gan.cls_train(x_train, y_train) + x2_gen, y_gen, s_gen = self.gan.ready_gen(inputs_latents) + # s_gen:sent_output + self.gan.gen_train(x2_gen, y_gen, s_gen, gt) diff --git a/fengshen/models/GAVAE/gans_model.py b/fengshen/models/GAVAE/gans_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5880acf9c36c6dfd41cf6286f25a93501e64e5e5 --- /dev/null +++ b/fengshen/models/GAVAE/gans_model.py @@ -0,0 +1,484 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +import numpy as np + + +class MyDataset(Dataset): + def __init__(self, x, y): + self.x = x + self.y = y + self.len = self.x.size(0) + + def __getitem__(self, index): + return self.x[index], self.y[index] + + def __len__(self): + return self.len + + +class MyDataset_new(Dataset): + def __init__(self, x, y, s): + self.x = x + self.y = y + self.s = s + self.len = self.x.size(0) + + def __getitem__(self, index): + return self.x[index], self.y[index], self.s[index] + + def __len__(self): + return self.len + + +class CLS_Net(torch.nn.Module): + + def __init__(self, cls_num, z_dim, cls_batch_size): + super(CLS_Net, self).__init__() + + mini_dim = 256 #256 + + out_input_num = mini_dim + + base_dim = 64 #256 #64 + + self.cls_batch_size = cls_batch_size + self.jie = 1 + + self.fc1 = nn.Linear(z_dim, mini_dim) + self.fc1.weight.data.normal_(0, 0.1) + + self.fc2 = nn.Linear(out_input_num, base_dim) + 
self.fc2.weight.data.normal_(0, 0.1) + + self.out = nn.Linear(base_dim, cls_num) + self.out.weight.data.normal_(0, 0.1) + + def self_dis(self, a): + max_dim = self.cls_batch_size + jie = self.jie + + all_tag = False + for j in range(a.shape[0]): + col_tag = False + for i in range(a.shape[0]): + tmp = F.pairwise_distance(a[j,:], a[i,:] , p = jie).view(-1,1) + if col_tag == False: + col_dis = tmp + col_tag = True + else: + col_dis = torch.cat((col_dis, tmp), dim = 0) + if all_tag == False: + all_dis = col_dis + all_tag = True + else: + all_dis = torch.cat((all_dis, col_dis), dim = 1) + ''' + print(all_dis.shape) + if all_dis.shape[1] < max_dim: + all_dis = torch.cat((all_dis, all_dis[:,:(max_dim - all_dis.shape[1])]), dim = 1) + print(all_dis.shape) + ''' + return all_dis + + def forward(self, x): + + x = self.fc1(x) + x1 = F.relu(x) + + x2 = self.fc2(x1) + x2 = torch.nn.Dropout(0.1)(x2) #0.3 + x2 = F.relu(x2) + + y = self.out(x2) + + return y, x1 + + +class Gen_Net(torch.nn.Module): + + def __init__(self,input_x2_dim, output_dim): + super(Gen_Net, self).__init__() + + self.x2_input = nn.Linear(input_x2_dim , 60) + self.x2_input.weight.data.normal_(0, 0.1) + + self.fc1 = nn.Linear(60, 128) + self.fc1.weight.data.normal_(0, 0.1) + + self.fc2 = nn.Linear(128, 256) + self.fc2.weight.data.normal_(0, 0.1) + + self.fc3 = nn.Linear(256, 128) + self.fc3.weight.data.normal_(0, 0.1) + + self.out = nn.Linear(128, output_dim) + self.out.weight.data.normal_(0, 0.1) + + def forward(self,x2): + x2 = self.x2_input(x2) + + x = x2 + x = self.fc1(x) + x = F.relu(x) + + x = self.fc2(x) + x = F.relu(x) + + x = self.fc3(x) + x = F.relu(x) + y = self.out(x) + + return y + + +class gans_process(): + + def __init__(self, config): + + #base pare + self.device = config.device + self.cls_num = config.cls_num + self.x2_dim = config.noise_dim + self.z_dim = config.z_dim + + self.cls_lr = config.cls_lr + self.gen_lr = config.gen_lr + self.cls_epoches = config.cls_epoches + self.gen_epoches = config.gen_epoches + self.mse_weight = 1.0 + + self.cls_batch_size = config.cls_batch_size + self.gen_batch_size = config.gen_batch_size + self.eval_batch_size = config.cls_batch_size + self.gen_batch_size = self.cls_batch_size + + #optimer and net + self.cls_net = CLS_Net(self.cls_num, self.z_dim, self.cls_batch_size).to(self.device) + self.cls_optimizer = torch.optim.SGD(self.cls_net.parameters(), + lr = self.cls_lr , weight_decay= 1e-5) + # gen net + self.gen_net = Gen_Net(self.x2_dim, self.z_dim).to(self.device) + + self.gen_optimizer = torch.optim.SGD(self.gen_net.parameters(), + lr = self.gen_lr , weight_decay= 0.01) + + #base loss + self.loss_func = torch.nn.CrossEntropyLoss() + self.loss_mse = torch.nn.MSELoss() + + def freeze_cls(self): + for param in self.cls_net.parameters(): + param.requires_grad = False + + def unfreeze_cls(self): + for param in self.cls_net.parameters(): + param.requires_grad = True + + def freeze_gen(self): + for param in self.gen_net.parameters(): + param.requires_grad = False + + def unfreeze_gen(self): + for param in self.gen_net.parameters(): + param.requires_grad = True + + def labels2genx(self, sample_num): + x = torch.rand(sample_num, self.x2_dim) + return x.to(self.device) + + def pad_batch(self, x): + if int(x.shape[0] % self.cls_batch_size) == 0: + return x + pad_len = self.cls_batch_size - ( x.shape[0] % self.cls_batch_size) + x = torch.cat((x, x[:pad_len]), dim = 0) + return x + + def ready_cls(self, sent_output,perm=None): + sample_num = len(sent_output) + #---------------make fake 
z--------------- + sent_output = sent_output.to(self.device) + sent_noise = torch.tensor(self.gen_test(sample_num)).to(self.device) + + #--------------handle datas--------------- + x = torch.cat((sent_output, sent_noise), dim = 0 ) + if perm is None: + perm = torch.randperm(len(x)) + x = x[perm] + #add y - only one label per time + multi_label_num = 1 + multi_output_y = torch.tensor([0]*sample_num).unsqueeze(1) + multi_noise_y = torch.zeros([sent_noise.size(0),1], dtype = torch.int) + multi_noise_y = multi_noise_y + multi_label_num + + y = torch.cat((multi_output_y, multi_noise_y), dim = 0).to(self.device) + y = y[perm] + # x_train = x [:self.train_len] + # y_train = y [:self.train_len] + # x_test = x [self.train_len:] + # y_test = y [self.train_len:] + + return x,y,None,None,perm + + def ready_fake(self, sent_output, inputs_labels, inputs_indexs, label2id, perm = None): + + #---------------make fake z--------------- + sent_output = sent_output.to(self.device) + sent_noise = torch.tensor(self.gen_test(inputs_labels, inputs_indexs)).to(self.device) + + #--------------handle datas--------------- + x = sent_noise + y = torch.tensor(inputs_labels).unsqueeze(1) + if perm is None: + perm = torch.randperm(len(x)) + x = x[perm] + y = y[perm] + + return x,y,perm + + def ready_gen(self, sent_output): + #, inputs_labels, inputs_indexs + sent_num = len(sent_output) + sent_output = sent_output.to(self.device) + x2 = self.labels2genx(sent_num) + y = torch.tensor([0]*sent_num).unsqueeze(1).to(self.device) + + return x2, y, sent_output + + def cls_train(self, x, y, if_oneHot = True): + + #init + self.cls_net.train() + self.gen_net.eval() + + self.unfreeze_cls() + self.freeze_gen() + + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.cls_batch_size, shuffle=True) + + #training + for epoch in range(self.cls_epoches): + losses = [] + accuracy = [] + for step, (batch_x, batch_y) in enumerate(train_loader): + self.cls_optimizer.zero_grad() + + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #One-side label smoothing -not used + #location 0 real, location 1 fake + batch_y = batch_y * torch.tensor([0.9, 1.0]).to(self.device) + + loss.backward() + self.cls_optimizer.step() + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct = np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x.shape[0]) + losses.append(loss) + accuracy.append(running_train_acc) + + + return self.cls_net + + def cls_eval(self, x, y, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + losses = [] + accuracy = [] + #evaling + for step, (batch_x, batch_y) in enumerate(train_loader): + out,_ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct 
= np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x.shape[0]) + accuracy.append(running_train_acc) + + + mean_acc = np.mean(accuracy) + return mean_acc + + def cls_real_eval(self, x, y, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = y.to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + rs = 0 + alls = 0 + + #evaling + for step, (batch_x, batch_y) in enumerate(train_loader): + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _,real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + right_num = np.sum([int( x==y and int(y) != int(self.cls_num-1) ) for x,y in zip(predictions, real_y)]) + all_num = np.sum([int(int(y) != int(self.cls_num-1) ) for x,y in zip(predictions, real_y)]) + + rs = rs + right_num + alls = alls + all_num + + + return rs/alls + + def cls_test(self, x, if_oneHot = True): + + #init + self.cls_net.eval() + x = x.to(self.device) + y = torch.zeros([x.size(0),1], dtype = torch.float).to(self.device) + + #if oneHot + if if_oneHot: + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + #make dataset + mydataset = MyDataset(x, y) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + preds = [] + #testing + for step, (batch_x, batch_y) in enumerate(train_loader): + out, _ = self.cls_net(batch_x) + loss = self.loss_func(out, batch_y) + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + preds.extend(predictions) + + return preds + + def gen_train(self, x2, y, s, times): + + #init + self.cls_net.eval() + self.gen_net.train() + + self.freeze_cls() + self.unfreeze_gen() + + #y is gen + cls + y = torch.zeros(y.size(0), self.cls_num).to(self.device).scatter_(1, y.long(), 1) + + #make dataset + mydataset = MyDataset_new(x2, y, s) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.gen_batch_size, shuffle=True) + + #training + for epoch in range(self.gen_epoches): + losses = [] + accuracy = [] + for step, (batch_x2, batch_y, batch_s) in enumerate(train_loader): + + # no zero_grad = make batch_size + if step % 6 == 5: #23 + self.gen_optimizer.zero_grad() + + out = self.gen_net(batch_x2) + + #fearture matching + out, hds = self.cls_net(out) + out2, hds2 = self.cls_net(batch_s.float()) + loss = self.loss_mse(hds, hds2) + loss = loss * pow(0.9, times) + loss.backward() + self.gen_optimizer.step() + + #tqdm + _, predictions = out.max(1) + predictions = predictions.cpu().numpy().tolist() + _, real_y = batch_y.max(1) + real_y = real_y.cpu().numpy().tolist() + + num_correct = np.sum([int(x==y) for x,y in zip(predictions, real_y)]) + running_train_acc = float(num_correct) / float(batch_x2.shape[0]) + losses.append(loss) + accuracy.append(running_train_acc) + + return self.gen_net + + def gen_test(self, sample_num): + + #init + self.gen_net.eval() + x2 = self.labels2genx(sample_num) + #x2: len(inputs_labels) * 80 + y = torch.zeros([sample_num,1], dtype = torch.float) + y = torch.zeros(sample_num, self.z_dim).scatter_(1, y.long(), 1) + y = y.to(self.device) + s = torch.ones((sample_num, self.z_dim)).to(self.device) + + #make dataset 
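+ # y (a one-hot placeholder) and s (all ones) only let MyDataset_new be reused for
+ # batching; the MSE loss against batch_s below is computed but never used, and the
+ # latents collected in preds come solely from the random noise x2 passed through the
+ # trained generator.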
+ mydataset = MyDataset_new(x2, y, s) + train_loader = DataLoader(dataset=mydataset, + batch_size=self.eval_batch_size, shuffle=False) + + preds = [] + #testing + for step, (batch_x2, batch_y, batch_s) in enumerate(train_loader): + + out = self.gen_net(batch_x2) + + loss = self.loss_mse(out.double(), batch_s.double()) + + predictions = out.cpu().detach().numpy().tolist() + preds.extend(predictions) + + return preds + + +if __name__ == '__main__': + + pass + diff --git a/fengshen/models/PPVAE/__init__.py b/fengshen/models/PPVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a92b6a8083d4f23a890ebe0c8635a94d0328fcea --- /dev/null +++ b/fengshen/models/PPVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch PPVAE model. """ diff --git a/fengshen/models/PPVAE/pluginVAE.py b/fengshen/models/PPVAE/pluginVAE.py new file mode 100644 index 0000000000000000000000000000000000000000..8841d64ca9d2cc63764015053a021103dfee24dd --- /dev/null +++ b/fengshen/models/PPVAE/pluginVAE.py @@ -0,0 +1,180 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig + +from fengshen.models.DAVAE.DAVAEModel import DAVAEModel +from fengshen.models.PPVAE.utils import * + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +class Encoder(nn.Module): + def __init__(self, latent_dim=128, bottle_dim=20) -> None: + super().__init__() + self.fc1 = nn.Linear(latent_dim, latent_dim//2) + self.fc2 = nn.Linear(latent_dim//2, latent_dim//4) + self.mean = nn.Linear(latent_dim//4, bottle_dim) + self.log_var = nn.Linear(latent_dim//4, bottle_dim) + + def kl_loss(self, mean, log_var): + return (-0.5 * (1 + log_var - mean**2 - log_var.exp()).sum(-1)).mean() + + def sampling(self, mean, log_var): + epsilon = torch.randn(mean.shape[0], mean.shape[-1], device=mean.device) + return mean + (log_var / 2).exp() * epsilon.unsqueeze(1) + + def forward(self, z): + ''' + :param z: shape (b, latent_dim) + ''' + z = self.fc1(z) + z = F.leaky_relu(z) + z = F.leaky_relu(self.fc2(z)) + z_mean = self.mean(z) + + z_log_var = self.log_var(z) + kl_loss = self.kl_loss(z_mean, z_log_var) + enc_z = self.sampling(z_mean, z_log_var) + + if not self.training: + enc_z = z_mean + + return enc_z, kl_loss + +class Decoder(nn.Module): + def __init__(self, latent_dim=128, bottle_dim=20) -> None: + super().__init__() + self.fc1 = nn.Linear(bottle_dim, latent_dim//4) + self.fc2 = nn.Linear(latent_dim//4, latent_dim//2) + self.fc3 = nn.Linear(latent_dim//2, latent_dim) + + def forward(self, enc_z): + z = F.leaky_relu(self.fc1(enc_z)) + z = F.leaky_relu(self.fc2(z)) + z = self.fc3(z) + return z + +class PluginVAE(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.kl_weight = config.kl_weight + 
self.beta = config.beta + self.encoder = Encoder(config.latent_dim, config.bottle_dim) + self.decoder = Decoder(config.latent_dim, config.bottle_dim) + + def set_beta(self, beta): + self.beta = beta + + def forward(self, z): + enc_z, kl_loss = self.encoder(z) + z_out = self.decoder(enc_z) + return z_out, kl_loss + + def loss(self, z): + z_out, kl_loss = self.forward(z) + z_loss = ((z_out-z)**2).mean() + loss = z_loss + self.kl_weight * (kl_loss-self.beta).abs() + return loss, kl_loss + +class PPVAEPretrainedModel(PreTrainedModel): + def _init_weights(self, module): + """ Initialize the weights """ + pass # to bypass the not implement error + +class PPVAEModel(PPVAEPretrainedModel): + config_class = PretrainedConfig + def __init__(self, config:PretrainedConfig) -> None: + super().__init__(config=config) + self.config =config + self.pluginvae = PluginVAE(self.config) + self.vae_model = DAVAEModel(self.config) + + def train_plugin(self,encoder_tokenizer,decoder_tokenizer,input_texts,negative_samples=None): + # 输入:pluginVAE,label,train_data_dict + # 输出:pluginVAE + self.vae_model.set_tokenizers(encoder_tokenizer,decoder_tokenizer) + pos=self.get_latent(input_texts) + pos_batch_size = self.config.batch_size + total_epoch = self.config.total_epoch + pos_dataset = CustomDataset(pos) + pos_dataloader = DataLoader( + pos_dataset, + batch_size=pos_batch_size, + shuffle=True + ) + neg =None + if negative_samples is not None: + neg=self.get_latent(negative_samples) + neg_batch_size = int(pos_batch_size*(neg.shape[0]/pos.shape[0])) + neg_dataset = CustomDataset(neg) + neg_dataloader = DataLoader( + neg_dataset, + batch_size=neg_batch_size, + shuffle=True + ) + optimizer = torch.optim.Adam( + params=self.pluginvae.parameters(), + lr=self.config.ppvae_lr, betas=(self.config.mu, self.config.nu) + ) + gamma = self.config.gamma + iter_num = 0 + early_stopper = EarlyStopping() + min_loss = 10.0 + for epoch in range(total_epoch): + self.pluginvae.train() + total_pos_loss = 0.0 + total_neg_loss = 0.0 + total_loss = 0.0 + total_pos_kl = 0.0 + for i, data in enumerate(pos_dataloader): + if self.config.get_dymanic_beta: + self.pluginvae.set_beta(self.get_beta_weight(iter_num,self.config.beta,self.config.beta_total_step)) + iter_num += 1 + pos_loss,pos_kl = self.pluginvae.loss(data) + neg_loss = 0.0 + if neg is not None: + neg_data = next(iter(neg_dataloader)) + neg_loss,loss_kl = self.pluginvae.loss(neg_data) + if neg_loss.item()>self.config.neg_loss_threshold*pos_loss.item(): + # print("neg_loss exceed, detached") + neg_loss = neg_loss.detach() + total_neg_loss += neg_loss.item() + loss = pos_loss - gamma*neg_loss + optimizer.zero_grad() + loss.backward() + optimizer.step() + + total_pos_loss += pos_loss.item() + total_loss += loss.item() + total_pos_kl += pos_kl.item() + avg_loss = total_loss/len(pos_dataloader) + avg_kl_loss = total_pos_kl/len(pos_dataloader) + if avg_loss None: + super().__init__() + self.data = data + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + # Get data + d = self.data[index] + return d + +class EarlyStopping(): + def __init__(self, tolerance=10, min_delta=0): + + self.tolerance = tolerance + self.min_delta = min_delta + self.counter = 0 + self.early_stop = False + + def __call__(self, train_loss, min_loss): + if (train_loss-min_loss) > self.min_delta: + self.counter +=1 + if self.counter >= self.tolerance: + self.early_stop = True + +# def gen_text_from_center(args,plugin_vae, vae_model, decoder_tokenizer,label,epoch,pos): +# gen_text = [] +# latent_z = 
gen_latent_center(plugin_vae,pos).to(args.device).repeat((1,1)) +# print("latent_z",latent_z.shape) +# text_analogy = text_from_latent_code_batch(latent_z, vae_model, args, decoder_tokenizer) +# print("label",label) +# print(text_analogy) +# gen_text.extend([(label,y,epoch) for y in text_analogy]) +# text2out(gen_text, '/cognitive_comp/liangyuxin/projects/cond_vae/outputs/test.json') \ No newline at end of file diff --git a/fengshen/models/__init__.py b/fengshen/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9bad5790a5799b96f2e164d825c0b1f8ec0c2dfb --- /dev/null +++ b/fengshen/models/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/fengshen/models/albert/modeling_albert.py b/fengshen/models/albert/modeling_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..7c5298825fb471e0575dabaefb2b8514e5bedcd8 --- /dev/null +++ b/fengshen/models/albert/modeling_albert.py @@ -0,0 +1,1363 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ALBERT model. """ + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers import AlbertConfig + + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "albert-base-v2" +_CONFIG_FOR_DOC = "AlbertConfig" +_TOKENIZER_FOR_DOC = "AlbertTokenizer" + + +ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "albert-base-v1", + "albert-large-v1", + "albert-xlarge-v1", + "albert-xxlarge-v1", + "albert-base-v2", + "albert-large-v2", + "albert-xlarge-v2", + "albert-xxlarge-v2", + # See all ALBERT models at https://huggingface.co./models?filter=albert +] + + +def load_tf_weights_in_albert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." 
+ ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + print(name) + + for name, array in zip(names, arrays): + original_name = name + + # If saved from the TF HUB module + name = name.replace("module/", "") + + # Renaming and simplifying + name = name.replace("ffn_1", "ffn") + name = name.replace("bert/", "albert/") + name = name.replace("attention_1", "attention") + name = name.replace("transform/", "") + name = name.replace("LayerNorm_1", "full_layer_layer_norm") + name = name.replace("LayerNorm", "attention/LayerNorm") + name = name.replace("transformer/", "") + + # The feed forward layer had an 'intermediate' step which has been abstracted away + name = name.replace("intermediate/dense/", "") + name = name.replace("ffn/intermediate/output/dense/", "ffn_output/") + + # ALBERT attention was split between self and output which have been abstracted away + name = name.replace("/output/", "/") + name = name.replace("/self/", "/") + + # The pooler is a linear layer + name = name.replace("pooler/dense", "pooler") + + # The classifier was simplified to predictions from cls/predictions + name = name.replace("cls/predictions", "predictions") + name = name.replace("predictions/attention", "predictions") + + # Naming was changed to be more explicit + name = name.replace("embeddings/attention", "embeddings") + name = name.replace("inner_group_", "albert_layers/") + name = name.replace("group_", "albert_layer_groups/") + + # Classifier + if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): + name = "classifier/" + name + + # No ALBERT model currently handles the next sentence prediction task + if "seq_relationship" in name: + name = name.replace("seq_relationship/output_", "sop_classifier/classifier/") + name = name.replace("weights", "weight") + + name = name.split("/") + + # Ignore the gradients applied by the LAMB/ADAM optimizers. 
+ if ( + "adam_m" in name + or "adam_v" in name + or "AdamWeightDecayOptimizer" in name + or "AdamWeightDecayOptimizer_1" in name + or "global_step" in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print(f"Initialize PyTorch weight {name} from {original_name}") + pointer.data = torch.from_numpy(array) + + return model + + +class AlbertEmbeddings(nn.Module): + """ + Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = 
self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class AlbertAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads}" + ) + + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pruned_heads = set() + + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.query = prune_linear_layer(self.query, index) + self.key = prune_linear_layer(self.key, index) + self.value = prune_linear_layer(self.value, index) + self.dense = prune_linear_layer(self.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.num_attention_heads = self.num_attention_heads - len(heads) + self.all_head_size = self.attention_head_size * self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = 
self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.attention_dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose(2, 1).flatten(2) + + projected_context_layer = self.dense(context_layer) + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout) + return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,) + + +class AlbertLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = AlbertAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False + ): + attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions) + + ffn_output = apply_chunking_to_forward( + self.ff_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[0], + ) + hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) + + return (hidden_states,) + attention_output[1:] # add attentions if we output them + + def ff_chunk(self, attention_output): + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + return ffn_output + + +class AlbertLayerGroup(nn.Module): + def __init__(self, config): + super().__init__() + + self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) + + def forward( + self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False + ): + layer_hidden_states = () + layer_attentions = () + + for layer_index, albert_layer in enumerate(self.albert_layers): + layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if output_hidden_states: + outputs = outputs + (layer_hidden_states,) + if output_attentions: + outputs = outputs + (layer_attentions,) + return outputs # last-layer hidden state, (layer hidden states), (layer attentions) + + +class AlbertTransformer(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) + self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + + all_hidden_states = (hidden_states,) if 
output_hidden_states else None + all_attentions = () if output_attentions else None + + head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask + + for i in range(self.config.num_hidden_layers): + # Number of layers in a hidden group + layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) + + # Index of the hidden group + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + output_attentions, + output_hidden_states, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class AlbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlbertConfig + load_tf_weights = load_tf_weights_in_albert + base_model_prefix = "albert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@dataclass +class AlbertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.AlbertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + sop_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +ALBERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Args: + config (:class:`~transformers.AlbertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +ALBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.AlbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.__call__` and :meth:`transformers.PreTrainedTokenizer.encode` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. 
+ output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertModel(AlbertPreTrainedModel): + + config_class = AlbertConfig + base_model_prefix = "albert" + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + + self.config = config + self.embeddings = AlbertEmbeddings(config) + self.encoder = AlbertTransformer(config) + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.pooler_activation = nn.Tanh() + else: + self.pooler = None + self.pooler_activation = None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has + a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT + model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers. + + These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer, + while [2,3] correspond to the two inner groups of the second hidden layer. + + Any layer with in index other than [0,1,2,3] will result in an error. 
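As a concrete check on the flattened indexing described above, the following sketch (illustrative only, not part of the patch) mirrors the arithmetic `_prune_heads` uses below to turn a flattened layer index into a hidden-group / inner-group pair, assuming the docstring's example of two hidden groups with two inner groups each:

```python
# Sketch: resolve a flattened layer index into (group_idx, inner_group_idx),
# using the same arithmetic as _prune_heads with inner_group_num = 2.
inner_group_num = 2

def resolve(layer: int):
    group_idx = layer // inner_group_num
    inner_group_idx = layer - group_idx * inner_group_num
    return group_idx, inner_group_idx

# The four distinct layers of the 12-layer / 2-group example map as:
assert [resolve(i) for i in range(4)] == [(0, 0), (0, 1), (1, 0), (1, 1)]
```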
See base class PreTrainedModel for more + information about head pruning + """ + for layer, heads in heads_to_prune.items(): + group_idx = int(layer / self.config.inner_group_num) + inner_group_idx = int(layer - group_idx * self.config.inner_group_num) + self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) # + extended_attention_mask = attention_mask[:, None, :, :] + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `sentence order prediction (classification)` head. 
+ """, + ALBERT_START_DOCSTRING, +) +class AlbertForPreTraining(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config) + self.predictions = AlbertMLMHead(config) + self.sop_classifier = AlbertSOPHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + sentence_order_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence + A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A). 
+ + Returns: + + Example:: + + >>> from transformers import AlbertTokenizer, AlbertForPreTraining + >>> import torch + + >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2') + + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + >>> outputs = model(input_ids) + + >>> prediction_logits = outputs.prediction_logits + >>> sop_logits = outputs.sop_logits + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + + prediction_scores = self.predictions(sequence_output) + sop_scores = self.sop_classifier(pooled_output) + + total_loss = None + if labels is not None and sentence_order_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) + total_loss = masked_lm_loss + sentence_order_loss + + if not return_dict: + output = (prediction_scores, sop_scores) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return AlbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AlbertMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.LayerNorm = nn.LayerNorm(config.embedding_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.decoder = nn.Linear(config.embedding_size, config.vocab_size) + self.activation = ACT2FN[config.hidden_act] + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.decoder(hidden_states) + + prediction_scores = hidden_states + + return prediction_scores + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +class AlbertSOPHead(nn.Module): + def __init__(self, config): + super().__init__() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, pooled_output): + dropout_pooled_output = self.dropout(pooled_output) + logits = self.classifier(dropout_pooled_output) + return logits + + +@add_start_docstrings( + "Albert Model with a `language modeling` head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertForMaskedLM(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.predictions = AlbertMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + 
self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_outputs = outputs[0] + + prediction_scores = self.predictions(sequence_outputs) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForSequenceClassification(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., + config.num_labels - 1]``. If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 
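The label conventions above, together with the `problem_type` dispatch in the forward pass that follows, can be summarised in a small sketch (illustrative only, not part of the patch):

```python
import torch

# num_labels == 1: float targets of shape (batch_size,) -> regression, MSE loss
regression_labels = torch.tensor([0.7, 1.3])

# num_labels > 1 with integer targets of shape (batch_size,) -> single-label cross-entropy
classification_labels = torch.tensor([2, 0])

# float multi-hot targets of shape (batch_size, num_labels) -> multi-label BCE-with-logits
multi_label_targets = torch.tensor([[1.0, 0.0, 1.0],
                                    [0.0, 1.0, 0.0]])
```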
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForTokenClassification(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + classifier_dropout_prob = ( + config.classifier_dropout_prob + if config.classifier_dropout_prob is not None + else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ALBERT_START_DOCSTRING, +) +class AlbertForQuestionAnswering(AlbertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForMultipleChoice(AlbertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. 
(see + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fengshen/models/auto/__init__.py b/fengshen/models/auto/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef185f32cc2d9f9b30db1a6a681ce2df34936351 --- /dev/null +++ b/fengshen/models/auto/__init__.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
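Before moving on to the `auto` package, the input reshaping used by `AlbertForMultipleChoice` above is worth sketching (illustrative only, not part of the patch): every choice is scored as its own sequence and the per-sequence logits are folded back into `(batch_size, num_choices)`.

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 16              # hypothetical sizes

input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (batch_size * num_choices, seq_len)

# One score per flattened sequence, regrouped per example before the cross-entropy:
logits = torch.randn(batch_size * num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)            # (batch_size, num_choices)
assert reshaped_logits.shape == (batch_size, num_choices)
```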
+ +from typing import TYPE_CHECKING + +from transformers.file_utils import _LazyModule, is_torch_available + + +_import_structure = { + "auto_factory": ["get_values"], + "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"], + "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"], +} + +if is_torch_available(): + _import_structure["modeling_auto"] = [ + "AutoModel", + "AutoModelForMaskedLM", + "AutoModelForMultipleChoice", + "AutoModelForPreTraining", + "AutoModelForQuestionAnswering", + "AutoModelForSequenceClassification", + "AutoModelForTokenClassification", + ] + +if TYPE_CHECKING: + from .auto_factory import get_values + from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig + from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer + if is_torch_available(): + from .modeling_auto import ( + AutoModel, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoModelForPreTraining, + AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/fengshen/models/auto/auto_factory.py b/fengshen/models/auto/auto_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..688bbd4853284305d047be0552077f721e2f97de --- /dev/null +++ b/fengshen/models/auto/auto_factory.py @@ -0,0 +1,644 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Factory function to build auto-model classes.""" +import importlib +from collections import OrderedDict + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import copy_func +from transformers.utils import logging +from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + + +CLASS_DOCSTRING = """ + This is a generic model class that will be instantiated as one of the model classes of the library when created + with the [`~BaseAutoModelClass.from_pretrained`] class method or the [`~BaseAutoModelClass.from_config`] class + method. + + This class cannot be instantiated directly using `__init__()` (throws an error). +""" + +FROM_CONFIG_DOCSTRING = """ + Instantiates one of the model classes of the library from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. It only affects the + model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model weights. 
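The `__init__.py` above wires the package through `_LazyModule`, so heavyweight submodules such as `modeling_auto` are only imported when one of their attributes is first accessed. A minimal usage sketch (illustrative only, not part of the patch; it assumes the fengshen package and its dependencies are importable):

```python
# Importing the package itself is cheap: no model code has been loaded yet.
from fengshen.models import auto

# Attribute access triggers the lazy import of the corresponding submodule.
AutoConfig = auto.AutoConfig   # loads configuration_auto on first access
AutoModel = auto.AutoModel     # loads modeling_auto (requires torch)
```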
+ + Args: + config ([`PretrainedConfig`]): + The model class to instantiate is selected based on the configuration class: + + List options + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained("checkpoint_placeholder") + >>> model = BaseAutoModelClass.from_config(config) + ``` +""" + +FROM_PRETRAINED_TORCH_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are + deactivated). To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + state_dict (*Dict[str, torch.Tensor]*, *optional*): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and + [`~PreTrainedModel.from_pretrained`] is not a simpler option. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (`bool`, *optional*, defaults to `False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower) + >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_TF_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. 
+ + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. 
+ trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_FLAX_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. 
+ config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. 
+ + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + + +def _get_model_class(config, model_mapping): + supported_models = model_mapping[type(config)] + if not isinstance(supported_models, (list, tuple)): + return supported_models + + name_to_model = {model.__name__: model for model in supported_models} + architectures = getattr(config, "architectures", []) + for arch in architectures: + if arch in name_to_model: + return name_to_model[arch] + elif f"TF{arch}" in name_to_model: + return name_to_model[f"TF{arch}"] + elif f"Flax{arch}" in name_to_model: + return name_to_model[f"Flax{arch}"] + + # If not architecture is set in the config or match the supported models, the first element of the tuple is the + # defaults. + return supported_models[0] + + +class _BaseAutoModelClass: + # Base class for auto models. + _model_mapping = None + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config, **kwargs): + trust_remote_code = kwargs.pop("trust_remote_code", False) + if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: + if not trust_remote_code: + raise ValueError( + "Loading this model requires you to execute the modeling file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." + ) + class_ref = config.auto_map[cls.__name__] + module_file, class_name = class_ref.split(".") + model_class = get_class_from_dynamic_module( + config.name_or_path, module_file + ".py", class_name, **kwargs) + return model_class._from_config(config, **kwargs) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class._from_config(config, **kwargs) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
+ ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) + kwargs["_from_auto"] = True + if not isinstance(config, PretrainedConfig): + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **kwargs + ) + if hasattr(config, "auto_map") and cls.__name__ in config.auto_map: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the modeling file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." + ) + class_ref = config.auto_map[cls.__name__] + module_file, class_name = class_ref.split(".") + model_class = get_class_from_dynamic_module( + pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs + ) + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + + @classmethod + def register(cls, config_class, model_class): + """ + Register a new model for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + model_class ([`PreTrainedModel`]): + The model to register. + """ + if hasattr(model_class, "config_class") and model_class.config_class != config_class: + raise ValueError( + "The model class you are passing has a `config_class` attribute that is not consistent with the " + f"config class you passed (model has {model_class.config_class} and you passed {config_class}. Fix " + "one of those so they match!" + ) + cls._model_mapping.register(config_class, model_class) + + +def insert_head_doc(docstring, head_doc=""): + if len(head_doc) > 0: + return docstring.replace( + "one of the model classes of the library ", + f"one of the model classes of the library (with a {head_doc} head) ", + ) + return docstring.replace( + "one of the model classes of the library ", "one of the base model classes of the library " + ) + + +def auto_class_update(cls, checkpoint_for_example="bert-base-cased", head_doc=""): + # Create a new class with the right name from the base class + model_mapping = cls._model_mapping + name = cls.__name__ + class_docstring = insert_head_doc(CLASS_DOCSTRING, head_doc=head_doc) + cls.__doc__ = class_docstring.replace("BaseAutoModelClass", name) + + # Now we need to copy and re-register `from_config` and `from_pretrained` as class methods otherwise we can't + # have a specific docstrings for them. 
+ from_config = copy_func(_BaseAutoModelClass.from_config) + from_config_docstring = insert_head_doc( + FROM_CONFIG_DOCSTRING, head_doc=head_doc) + from_config_docstring = from_config_docstring.replace( + "BaseAutoModelClass", name) + from_config_docstring = from_config_docstring.replace( + "checkpoint_placeholder", checkpoint_for_example) + from_config.__doc__ = from_config_docstring + from_config = replace_list_option_in_docstrings( + model_mapping._model_mapping, use_model_types=False)(from_config) + cls.from_config = classmethod(from_config) + + if name.startswith("TF"): + from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING + elif name.startswith("Flax"): + from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING + else: + from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING + from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained) + from_pretrained_docstring = insert_head_doc( + from_pretrained_docstring, head_doc=head_doc) + from_pretrained_docstring = from_pretrained_docstring.replace( + "BaseAutoModelClass", name) + from_pretrained_docstring = from_pretrained_docstring.replace( + "checkpoint_placeholder", checkpoint_for_example) + shortcut = checkpoint_for_example.split("/")[-1].split("-")[0] + from_pretrained_docstring = from_pretrained_docstring.replace( + "shortcut_placeholder", shortcut) + from_pretrained.__doc__ = from_pretrained_docstring + from_pretrained = replace_list_option_in_docstrings( + model_mapping._model_mapping)(from_pretrained) + cls.from_pretrained = classmethod(from_pretrained) + return cls + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("fengshen") + return getattribute_from_module(transformers_module, attr) + + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. 
+ + Args: + + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = { + v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type not in self._model_mapping: + raise KeyError(key) + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + def _load_attr_from_module(self, model_type, attr): + module_name = model_type_to_module_name(model_type) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module( + f".{module_name}", "fengshen.models") + return getattribute_from_module(self._modules[module_name], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value): + """ + Register a new model in this mapping. + """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys(): + raise ValueError( + f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value diff --git a/fengshen/models/auto/configuration_auto.py b/fengshen/models/auto/configuration_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..81676226e57ca519273b98328a1afe6961c37ce3 --- /dev/null +++ b/fengshen/models/auto/configuration_auto.py @@ -0,0 +1,403 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Config class.""" +import importlib +import re +import warnings +from collections import OrderedDict +from typing import List, Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import CONFIG_NAME +from transformers.utils import logging +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + # Add configs here + ("roformer", "RoFormerConfig"), + ("longformer", "LongformerConfig"), + ] +) + +CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( + [ + # Add archive maps here + ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ] +) + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + ("roformer", "Roformer"), + ("longformer", "Longformer"), + ] +) + +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) + + +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + return key.replace("-", "_") + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "fengshen.models") + + return getattr(self._modules[module_name], value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value): + """ + Register a new configuration in this mapping. + """ + if key in self._mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + +class _LazyLoadAllMappings(OrderedDict): + """ + A mapping that will load all pairs of key values at the first access (either by indexing, requestions keys, values, + etc.) + + Args: + mapping: The mapping to load. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._initialized = False + self._data = {} + + def _initialize(self): + if self._initialized: + return + warnings.warn( + "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP is deprecated and will be removed in v5 of Transformers. 
" + "It does not contain all available model checkpoints, far from it. Checkout hf.co/models for that.", + FutureWarning, + ) + + for model_type, map_name in self._mapping.items(): + module_name = model_type_to_module_name(model_type) + module = importlib.import_module(f".{module_name}", "transformers.models") + mapping = getattr(module, map_name) + self._data.update(mapping) + + self._initialized = True + + def __getitem__(self, key): + self._initialize() + return self._data[key] + + def keys(self): + self._initialize() + return self._data.keys() + + def values(self): + self._initialize() + return self._data.values() + + def items(self): + self._initialize() + return self._data.keys() + + def __iter__(self): + self._initialize() + return iter(self._data) + + def __contains__(self, item): + self._initialize() + return item in self._data + + +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = _LazyLoadAllMappings(CONFIG_ARCHIVE_MAP_MAPPING_NAMES) + + +def _get_class_name(model_class: Union[str, List[str]]): + if isinstance(model_class, (list, tuple)): + return " or ".join([f"[`{c}`]" for c in model_class if c is not None]) + return f"[`{model_class}`]" + + +def _list_model_options(indent, config_to_class=None, use_model_types=True): + if config_to_class is None and not use_model_types: + raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") + if use_model_types: + if config_to_class is None: + model_type_to_name = {model_type: f"[`{config}`]" for model_type, config in CONFIG_MAPPING_NAMES.items()} + else: + model_type_to_name = { + model_type: _get_class_name(model_class) + for model_type, model_class in config_to_class.items() + if model_type in MODEL_NAMES_MAPPING + } + lines = [ + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" + for model_type in sorted(model_type_to_name.keys()) + ] + else: + config_to_name = { + CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) + for config, clas in config_to_class.items() + if config in CONFIG_MAPPING_NAMES + } + config_to_model_name = { + config: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING_NAMES.items() + } + lines = [ + f"{indent}- [`{config_name}`] configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" + for config_name in sorted(config_to_name.keys()) + ] + return "\n".join(lines) + + +def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] + if use_model_types: + indent = f"{indent} " + lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + + +class AutoConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the [`~AutoConfig.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). 
+ """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + def for_model(cls, model_type: str, *args, **kwargs): + if model_type in CONFIG_MAPPING: + config_class = CONFIG_MAPPING[model_type] + return config_class(*args, **kwargs) + raise ValueError( + f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" + ) + + @classmethod + @replace_list_option_in_docstrings() + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a pretrained model configuration. + + The configuration class to instantiate is selected based on the `model_type` property of the config object that + is loaded, or when it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing a configuration file saved using the + [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method, + e.g., `./my_model_directory/`. + - A path or url to a saved configuration JSON *file*, e.g., + `./my_model_directory/configuration.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final configuration object. + + If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the + part of `kwargs` which has not been used to update `config` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. 
+            kwargs(additional keyword arguments, *optional*):
+                The values in kwargs of any keys which are configuration attributes will be used to override the loaded
+                values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
+                by the `return_unused_kwargs` keyword parameter.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoConfig
+
+        >>> # Download configuration from huggingface.co and cache.
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased")
+
+        >>> # Download configuration from huggingface.co (user-uploaded) and cache.
+        >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
+
+        >>> # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*).
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/")
+
+        >>> # Load a specific configuration file.
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
+
+        >>> # Change some config attributes when loading a pretrained config.
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
+        >>> config.output_attentions
+        True
+
+        >>> config, unused_kwargs = AutoConfig.from_pretrained(
+        ...     "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        ... )
+        >>> config.output_attentions
+        True
+
+        >>> unused_kwargs
+        {'foo': False}
+        ```"""
+        kwargs["_from_auto"] = True
+        kwargs["name_or_path"] = pretrained_model_name_or_path
+        trust_remote_code = kwargs.pop("trust_remote_code", False)
+        config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]:
+            if not trust_remote_code:
+                raise ValueError(
+                    f"Loading {pretrained_model_name_or_path} requires you to execute the configuration file in that repo "
+                    "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
+                    "the option `trust_remote_code=True` to remove this error."
+                )
+            if kwargs.get("revision", None) is None:
+                logger.warning(
+                    "Explicitly passing a `revision` is encouraged when loading a configuration with custom code to "
+                    "ensure no malicious code has been contributed in a newer revision."
+                )
+            class_ref = config_dict["auto_map"]["AutoConfig"]
+            module_file, class_name = class_ref.split(".")
+            config_class = get_class_from_dynamic_module(
+                pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+            )
+            return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif "model_type" in config_dict:
+            config_class = CONFIG_MAPPING[config_dict["model_type"]]
+            return config_class.from_dict(config_dict, **kwargs)
+        else:
+            # Fallback: use pattern matching on the string.
+            for pattern, config_class in CONFIG_MAPPING.items():
+                if pattern in str(pretrained_model_name_or_path):
+                    return config_class.from_dict(config_dict, **kwargs)
+
+        raise ValueError(
+            f"Unrecognized model in {pretrained_model_name_or_path}. "
+            f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings "
+            f"in its name: {', '.join(CONFIG_MAPPING.keys())}"
+        )
+
+    @staticmethod
+    def register(model_type, config):
+        """
+        Register a new configuration for this class.
+
+        Args:
+            model_type (`str`): The model type like "bert" or "gpt".
+            config ([`PretrainedConfig`]): The config to register.
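+
+        Example (an illustrative sketch only; `MyConfig` is a hypothetical `PretrainedConfig` subclass, not part of this repo):
+
+        ```python
+        >>> from transformers import PretrainedConfig
+
+        >>> class MyConfig(PretrainedConfig):
+        ...     model_type = "my-model"
+
+        >>> AutoConfig.register("my-model", MyConfig)
+        >>> # "my-model" now resolves through CONFIG_MAPPING, e.g. AutoConfig.for_model("my-model") returns a MyConfig instance.
+        ```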
+ """ + if issubclass(config, PretrainedConfig) and config.model_type != model_type: + raise ValueError( + "The config you are passing has a `model_type` attribute that is not consistent with the model type " + f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " + "match!" + ) + CONFIG_MAPPING.register(model_type, config) diff --git a/fengshen/models/auto/dynamic.py b/fengshen/models/auto/dynamic.py new file mode 100644 index 0000000000000000000000000000000000000000..5760f6e9292195674d7096996cf3cc0ac35aa0c4 --- /dev/null +++ b/fengshen/models/auto/dynamic.py @@ -0,0 +1,235 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities to dynamically load model and tokenizer from the Hub.""" + +import importlib +import os +import re +import shutil +import sys +from pathlib import Path +from typing import Dict, Optional, Union + +from transformers.file_utils import ( + HF_MODULES_CACHE, + TRANSFORMERS_DYNAMIC_MODULE_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, +) +from transformers.utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def init_hf_modules(): + """ + Creates the cache directory for modules with an init, and adds it to the Python path. + """ + # This function has already been executed if HF_MODULES_CACHE already is in the Python path. + if HF_MODULES_CACHE in sys.path: + return + + sys.path.append(HF_MODULES_CACHE) + os.makedirs(HF_MODULES_CACHE, exist_ok=True) + init_path = Path(HF_MODULES_CACHE) / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def create_dynamic_module(name: Union[str, os.PathLike]): + """ + Creates a dynamic module in the cache directory for modules. + """ + init_hf_modules() + dynamic_module_path = Path(HF_MODULES_CACHE) / name + # If the parent module does not exist yet, recursively create it. + if not dynamic_module_path.parent.exists(): + create_dynamic_module(dynamic_module_path.parent) + os.makedirs(dynamic_module_path, exist_ok=True) + init_path = dynamic_module_path / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def check_imports(filename): + """ + Check if the current Python environment contains all the libraries that are imported in a file. 
+ """ + with open(filename, "r", encoding="utf-8") as f: + content = f.read() + + # Imports of the form `import xxx` + imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `from xxx import yyy` + imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + # Only keep the top-level module + imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] + + # Unique-ify and test we got them all + imports = list(set(imports)) + missing_packages = [] + for imp in imports: + try: + importlib.import_module(imp) + except ImportError: + missing_packages.append(imp) + + if len(missing_packages) > 0: + raise ImportError( + "This modeling file requires the following packages that were not found in your environment: " + f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`" + ) + + +def get_class_in_module(class_name, module_path): + """ + Import a module on the cache directory for modules and extract a class from it. + """ + module_path = module_path.replace(os.path.sep, ".") + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +def get_class_from_dynamic_module( + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + class_name: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Extracts a class from a module file, present in the local folder or repository of a model. + + + + Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should + therefore only be called on trusted repos. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + module_file (`str`): + The name of the module file containing the class to look for. + class_name (`str`): + The name of the class to import in the module. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). 
+ revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + `type`: The class, dynamically imported from the module. + + Examples: + + ```python + # Download module *modeling.py* from huggingface.co and cache then extract the class *MyBertModel* from this + # module. + cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel") + ```""" + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) + submodule = "local" + else: + module_file_or_url = hf_bucket_url( + pretrained_model_name_or_path, filename=module_file, revision=revision, mirror=None + ) + submodule = pretrained_model_name_or_path.replace("/", os.path.sep) + + try: + # Load from URL or cache if already cached + resolved_module_file = cached_path( + module_file_or_url, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") + raise + + # Check we have all the requirements in our environment + check_imports(resolved_module_file) + + # Now we move the module inside our cached dynamic modules. + full_submodule = TRANSFORMERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule + create_dynamic_module(full_submodule) + submodule_path = Path(HF_MODULES_CACHE) / full_submodule + if submodule == "local": + # We always copy local files (we could hash the file to see if there was a change, and give them the name of + # that hash, to only copy when there is a modification but it seems overkill for now). + # The only reason we do the copy is to avoid putting too many folders in sys.path. + module_name = module_file + shutil.copy(resolved_module_file, submodule_path / module_file) + else: + # The module file will end up being named module_file + the etag. This way we get the benefit of versioning. 
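+        # Illustrative (hypothetical file names): module_file "modeling.py" resolved to a cached file named
+        # "<url_hash>.<etag_hash>" becomes module_name "modeling_<url_hash>_<etag_hash>.py", so a changed etag
+        # yields a fresh module name instead of shadowing the previously cached version.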
+ resolved_module_file_name = Path(resolved_module_file).name + module_name_parts = [module_file.replace(".py", "")] + resolved_module_file_name.split(".") + module_name = "_".join(module_name_parts) + ".py" + if not (submodule_path / module_name).exists(): + shutil.copy(resolved_module_file, submodule_path / module_name) + + # And lastly we get the class inside our newly created module + final_module = os.path.join(full_submodule, module_name.replace(".py", "")) + return get_class_in_module(class_name, final_module) diff --git a/fengshen/models/auto/modeling_auto.py b/fengshen/models/auto/modeling_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..3805e86d239d63d826092fa811261b2334e608f7 --- /dev/null +++ b/fengshen/models/auto/modeling_auto.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Model class.""" + +import warnings +from collections import OrderedDict + +from transformers.utils import logging +from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from .configuration_auto import CONFIG_MAPPING_NAMES + + +logger = logging.get_logger(__name__) + + +MODEL_MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("roformer", "RoFormerModel"), + ("longformer", "LongformerModel"), + ] +) + +MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( + [ + # Model for pre-training mapping + ("longformer", "LongformerForMaskedLM"), + ] +) + +MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( + [ + # Model with LM heads mapping + ("roformer", "RoFormerForMaskedLM"), + ("longformer", "LongformerForMaskedLM"), + ] +) + +MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("roformer", "RoFormerForCausalLM"), + ] +) + + +MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Masked LM mapping + ("roformer", "RoFormerForMaskedLM"), + ("longformer", "LongformerForMaskedLM"), + ] +) + + +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Seq2Seq Causal LM mapping + ("t5", "T5ForConditionalGeneration"), + + ] +) + +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("speech-encoder-decoder", "SpeechEncoderDecoderModel"), + ("speech_to_text", "Speech2TextForConditionalGeneration"), + ] +) + +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Sequence Classification mapping + ("roformer", "RoFormerForSequenceClassification"), + ("longformer", "LongformerForSequenceClassification"), + ] +) + +MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Question Answering mapping + ("roformer", "RoFormerForQuestionAnswering"), + ("longformer", "LongformerForQuestionAnswering"), + ] +) + +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Table Question Answering mapping + ("tapas", "TapasForQuestionAnswering"), + ] +) + +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Token 
Classification mapping + ("roformer", "RoFormerForTokenClassification"), + ("longformer", "LongformerForTokenClassification"), + ] +) + +MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( + [ + # Model for Multiple Choice mapping + ("roformer", "RoFormerForMultipleChoice"), + ("longformer", "LongformerForMultipleChoice"), + ] +) + + + + +MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) + +MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) + +MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) + +MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) + +MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) + +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES +) +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES) + +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES) + + + +class AutoModel(_BaseAutoModelClass): + _model_mapping = MODEL_MAPPING + + +AutoModel = auto_class_update(AutoModel) + + +class AutoModelForPreTraining(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_PRETRAINING_MAPPING + + +AutoModelForPreTraining = auto_class_update(AutoModelForPreTraining, head_doc="pretraining") + + +# Private on purpose, the public class will add the deprecation warnings. 
+class _AutoModelWithLMHead(_BaseAutoModelClass):
+    _model_mapping = MODEL_WITH_LM_HEAD_MAPPING
+
+
+_AutoModelWithLMHead = auto_class_update(_AutoModelWithLMHead, head_doc="language modeling")
+
+
+class AutoModelForCausalLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING
+
+
+AutoModelForCausalLM = auto_class_update(AutoModelForCausalLM, head_doc="causal language modeling")
+
+
+class AutoModelForMaskedLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_MASKED_LM_MAPPING
+
+
+AutoModelForMaskedLM = auto_class_update(AutoModelForMaskedLM, head_doc="masked language modeling")
+
+
+class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
+
+
+AutoModelForSeq2SeqLM = auto_class_update(
+    AutoModelForSeq2SeqLM, head_doc="sequence-to-sequence language modeling", checkpoint_for_example="t5-base"
+)
+
+
+class AutoModelForSequenceClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
+
+
+AutoModelForSequenceClassification = auto_class_update(
+    AutoModelForSequenceClassification, head_doc="sequence classification"
+)
+
+
+class AutoModelForQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING
+
+
+AutoModelForQuestionAnswering = auto_class_update(AutoModelForQuestionAnswering, head_doc="question answering")
+
+
+class AutoModelForTableQuestionAnswering(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
+
+AutoModelForTableQuestionAnswering = auto_class_update(
+    AutoModelForTableQuestionAnswering,
+    head_doc="table question answering",
+    checkpoint_for_example="google/tapas-base-finetuned-wtq",
+)
+
+
+class AutoModelForTokenClassification(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
+
+
+AutoModelForTokenClassification = auto_class_update(AutoModelForTokenClassification, head_doc="token classification")
+
+
+class AutoModelForMultipleChoice(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_MULTIPLE_CHOICE_MAPPING
+
+
+AutoModelForMultipleChoice = auto_class_update(AutoModelForMultipleChoice, head_doc="multiple choice")
+
+
+class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING
+
+
+AutoModelForSpeechSeq2Seq = auto_class_update(
+    AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling"
+)
+
+
+class AutoModelWithLMHead(_AutoModelWithLMHead):
+    @classmethod
+    def from_config(cls, config):
+        warnings.warn(
+            "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use "
+            "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and "
+            "`AutoModelForSeq2SeqLM` for encoder-decoder models.",
+            FutureWarning,
+        )
+        return super().from_config(config)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        warnings.warn(
+            "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version.
Please use " + "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and " + "`AutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/fengshen/models/auto/tokenization_auto.py b/fengshen/models/auto/tokenization_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..6555191bef55336708cabc5e9b17c0322318a417 --- /dev/null +++ b/fengshen/models/auto/tokenization_auto.py @@ -0,0 +1,449 @@ +# coding=utf-8 +# Copyright 2021 The IDEA Authors. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Tokenizer class.""" + +import importlib +import json +import os +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import ( + cached_path, + get_list_of_files, + hf_bucket_url, + is_offline_mode, + is_sentencepiece_available, + is_tokenizers_available, +) +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE +from transformers.tokenization_utils_fast import PreTrainedTokenizerFast +from transformers.utils import logging +# from ..encoder_decoder import EncoderDecoderConfig +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + config_class_to_model_type, + model_type_to_module_name, + replace_list_option_in_docstrings, +) +from .dynamic import get_class_from_dynamic_module + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. 
+ TOKENIZER_MAPPING_NAMES: OrderedDict[str, + Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("roformer", ("RoFormerTokenizer", None)), + ("longformer", ("LongformerTokenizer", None)), + ] + ) + +TOKENIZER_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} + + +def tokenizer_class_from_name(class_name: str): + if class_name == "PreTrainedTokenizerFast": + return PreTrainedTokenizerFast + + for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): + if class_name in tokenizers: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module( + f".{module_name}", "transformers.models") + return getattr(module, class_name) + + for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): + for tokenizer in tokenizers: + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + + return None + + +def get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `use_auth_token=True` is required when you want to use a private model. 
+ + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Will raise a ValueError if `pretrained_model_name_or_path` is not a valid path or model identifier + repo_files = get_list_of_files( + pretrained_model_name_or_path, + revision=revision, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + ) + if TOKENIZER_CONFIG_FILE not in [Path(f).name for f in repo_files]: + return {} + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join( + pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE) + else: + config_file = hf_bucket_url( + pretrained_model_name_or_path, filename=TOKENIZER_CONFIG_FILE, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + + except EnvironmentError: + logger.info( + "Could not locate the tokenizer configuration file, will try to use the model config instead.") + return {} + + with open(resolved_config_file, encoding="utf-8") as reader: + return json.load(reader) + + +class AutoTokenizer: + r""" + This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when + created with the [`AutoTokenizer.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + r""" + Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary. + + The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a + user or organization name, like `dbmdz/bert-base-german-cased`. 
+ - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a + single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not + applicable to all derived classes) + inputs (additional positional arguments, *optional*): + Will be passed along to the Tokenizer `__init__()` method. + config ([`PretrainedConfig`], *optional*) + The configuration object used to dertermine the tokenizer class to instantiate. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + subfolder (`str`, *optional*): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to try to load the fast version of the tokenizer. + tokenizer_type (`str`, *optional*): + Tokenizer type to be loaded. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like + `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, + `additional_special_tokens`. See parameters in the `__init__()` for more details. + + Examples: + + ```python + >>> from transformers import AutoTokenizer + + >>> # Download vocabulary from huggingface.co and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + + >>> # Download vocabulary from huggingface.co (user-uploaded) and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") + + >>> # If vocabulary files are in a directory (e.g. 
tokenizer was saved using *save_pretrained('./test/saved_model/')*) + >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/") + ```""" + config = kwargs.pop("config", None) + kwargs["_from_auto"] = True + + use_fast = kwargs.pop("use_fast", True) + tokenizer_type = kwargs.pop("tokenizer_type", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) + + # First, let's see whether the tokenizer_type is passed so that we can leverage it + if tokenizer_type is not None: + tokenizer_class = None + tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get( + tokenizer_type, None) + + if tokenizer_class_tuple is None: + raise ValueError( + f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of " + f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}." + ) + + tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple + + if use_fast and tokenizer_fast_class_name is not None: + tokenizer_class = tokenizer_class_from_name( + tokenizer_fast_class_name) + + if tokenizer_class is None: + tokenizer_class = tokenizer_class_from_name( + tokenizer_class_name) + + if tokenizer_class is None: + raise ValueError( + f"Tokenizer class {tokenizer_class_name} is not currently imported.") + + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Next, let's try to use the tokenizer_config file to get the tokenizer class. + tokenizer_config = get_tokenizer_config( + pretrained_model_name_or_path, **kwargs) + + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + tokenizer_auto_map = tokenizer_config.get("auto_map") + + # If that did not work, let's try to use the config. + if config_tokenizer_class is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + config_tokenizer_class = config.tokenizer_class + if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map: + tokenizer_auto_map = config.auto_map["AutoTokenizer"] + + # If we have the tokenizer class from the tokenizer config or the model config we're good! + if config_tokenizer_class is not None: + tokenizer_class = None + if tokenizer_auto_map is not None: + if not trust_remote_code: + raise ValueError( + f"Loading {pretrained_model_name_or_path} requires you to execute the tokenizer file in that repo " + "on your local machine. Make sure you have read the code there to avoid malicious use, then set " + "the option `trust_remote_code=True` to remove this error." + ) + if kwargs.get("revision", None) is None: + logger.warn( + "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure " + "no malicious code has been contributed in a newer revision." 
+                    )
+
+                if use_fast and tokenizer_auto_map[1] is not None:
+                    class_ref = tokenizer_auto_map[1]
+                else:
+                    class_ref = tokenizer_auto_map[0]
+
+                module_file, class_name = class_ref.split(".")
+                tokenizer_class = get_class_from_dynamic_module(
+                    pretrained_model_name_or_path, module_file + ".py", class_name, **kwargs
+                )
+
+            elif use_fast and not config_tokenizer_class.endswith("Fast"):
+                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+            if tokenizer_class is None:
+                tokenizer_class_candidate = config_tokenizer_class
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+
+            if tokenizer_class is None:
+                raise ValueError(
+                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
+                )
+            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+        model_type = config_class_to_model_type(type(config).__name__)
+        if model_type is not None:
+            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
+            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
+                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+            else:
+                if tokenizer_class_py is not None:
+                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+                else:
+                    raise ValueError(
+                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
+                        "in order to use this tokenizer."
+                    )
+
+        raise ValueError(
+            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
+            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
+        )
+
+    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None):
+        """
+        Register a new tokenizer in this mapping.
+
+        Args:
+            config_class ([`PretrainedConfig`]):
+                The configuration corresponding to the model to register.
+            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
+                The slow tokenizer to register.
+            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
+                The fast tokenizer to register.
+        """
+        if slow_tokenizer_class is None and fast_tokenizer_class is None:
+            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`")
+        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
+            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
+        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
+            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")
+
+        if (
+            slow_tokenizer_class is not None
+            and fast_tokenizer_class is not None
+            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
+            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
+        ):
+            raise ValueError(
+                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
+                "consistent with the slow tokenizer class you passed (fast tokenizer has "
+                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those "
+                "so they match!"
+            )
+
+        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
+ if config_class in TOKENIZER_MAPPING._extra_content: + existing_slow, existing_fast = TOKENIZER_MAPPING[config_class] + if slow_tokenizer_class is None: + slow_tokenizer_class = existing_slow + if fast_tokenizer_class is None: + fast_tokenizer_class = existing_fast + + TOKENIZER_MAPPING.register( + config_class, (slow_tokenizer_class, fast_tokenizer_class)) diff --git a/fengshen/models/bart/modeling_bart.py b/fengshen/models/bart/modeling_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a58ac8036fbc0bb9334b083b12a5599950d355 --- /dev/null +++ b/fengshen/models/bart/modeling_bart.py @@ -0,0 +1,423 @@ +import warnings +from pytorch_lightning import LightningModule +from fengshen.models import transformer_utils + +import torch +import torch.utils.checkpoint +from torch import nn +import torch.nn.functional as F + +from dataclasses import dataclass +from typing import Optional, Tuple + +from transformers.file_utils import * +from transformers.modeling_outputs import * +from transformers.models.bart import * +from transformers.models.bart.modeling_bart import BartClassificationHead + + +_CONFIG_FOR_DOC = "BartConfig" + + +# ------------------------ ZZ: CBart addition ------------------------ + + +def _reorder_buffer(attn_cache, new_order): + for k, input_buffer_k in attn_cache.items(): + if input_buffer_k is not None: + attn_cache[k] = input_buffer_k.index_select(0, new_order) + return attn_cache + + +def _make_linear_from_emb(emb): + vocab_size, emb_size = emb.weight.shape + lin_layer = nn.Linear(vocab_size, emb_size, bias=False) + lin_layer.weight.data = emb.weight.data + return lin_layer + + +BART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() +""" + + +@dataclass +class CBartLMOutput(ModelOutput): + """ + Base class for CBart specific language models outputs. + + Args: + .... 
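+        loss (`torch.FloatTensor`, *optional*):
+            Weighted sum `encoder_loss * loss_weight + decoder_loss`, returned when both label sets are provided.
+        encoder_loss (`torch.FloatTensor`, *optional*):
+            Token-level classification (or regression) loss of the encoder head, returned when `encoder_labels` is given.
+        decoder_loss (`torch.FloatTensor`, *optional*):
+            Masked language modeling loss of the decoder, returned when `labels` is given.
+        encoder_logits (`torch.FloatTensor`):
+            Scores produced by the classification head on top of the encoder's last hidden state.
+
+    The remaining fields mirror `transformers.modeling_outputs.Seq2SeqLMOutput`.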
+ """ + loss: Optional[torch.FloatTensor] = None + encoder_loss: Optional[torch.FloatTensor] = None + decoder_loss: Optional[torch.FloatTensor] = None + encoder_logits: torch.FloatTensor = None + logits: torch.FloatTensor = None + past_key_values: Optional[Tuple[torch.FloatTensor]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BartForTextInfill(BartPretrainedModel): + """ + this class is designed for text infilling. + During training, the encoder is used to predict replace, insert, + and the decoder is used to generate original input. + Compared with BartForConditionalGeneration class, + we add a module over the encoder and add a new loss for the encoder. + """ + base_model_prefix = "model" + authorized_missing_keys = [r"final_logits_bias", + r"encoder\.version", r"decoder\.version"] + + def __init__(self, config: BartConfig): + super().__init__(config) + base_model = BartModel(config) + self.model = base_model + self.register_buffer("final_logits_bias", torch.zeros( + (1, self.model.shared.num_embeddings))) + # print( config.encoder_loss_type, config.num_labels) + + # add a new attribute into BartConfig class (revise BartConfig) + self.encoder_loss_type = config.encoder_loss_type + self.num_labels = config.num_labels + if self.encoder_loss_type == 0: # 0 is classification loss, 1 is regression loss + # add a classification module for the encoder + self.classification_head = BartClassificationHead( + config.d_model, config.d_model, config.num_labels, config.classif_dropout, + ) + else: + # add a regression module for the encoder + self.classification_head = BartClassificationHead( + config.d_model, config.d_model, 1, config.classif_dropout, + ) + + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + self.loss_weight = config.loss_weight + self.register_buffer("label_weights", torch.zeros((self.num_labels))) + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + old_num_tokens = self.model.shared.num_embeddings + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self.model.shared = new_embeddings + self._resize_final_logits_bias(new_num_tokens, old_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None: + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), + device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + def forward( + self, + input_ids, + attention_mask=None, + encoder_outputs=None, + decoder_input_ids=None, + decoder_attention_mask=None, + past_key_values=None, + encoder_labels=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=True, + **unused, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the 
masked language modeling loss. + Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens + with labels in ``[0, ..., config.vocab_size]``. + + Returns: + + Conditional generation example:: + + # Mask filling only works for bart-large + from transformers import BartTokenizer, BartForConditionalGeneration + tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + TXT = "My friends are but they eat too many carbs." + + model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + logits = model(input_ids).logits + + masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + probs = logits[0, masked_index].softmax(dim=0) + values, predictions = probs.topk(5) + + tokenizer.decode(predictions).split() + # ['good', 'great', 'all', 'really', 'very'] + """ + if "lm_labels" in unused: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = unused.pop("lm_labels") + if "decoder_cached_states" in unused: + warnings.warn( + "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `decoder_past_key_values` instead.", + FutureWarning, + ) + decoder_past_key_values = unused.pop("decoder_cached_states") + return_dict = return_dict if return_dict is not None else False + + if labels is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # logits and loss for the encoder + # last hidden state + encoder_last_hidden_state = outputs['encoder_last_hidden_state'] + # eos_mask = input_ids.eq(self.config.eos_token_id) + # if len(torch.unique(eos_mask.sum(1))) > 1: + # raise ValueError("All examples must have the same number of tokens.") + # sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] + encoder_logits = self.classification_head(encoder_last_hidden_state) + encoder_loss = None + if encoder_labels is not None: + # classification loss + if self.encoder_loss_type == 0: + # ZZ: seems like MSE loss does not support weighting, so only CEL has weighting applied for now + loss_fct = nn.CrossEntropyLoss(weight=self.label_weights) + encoder_loss = loss_fct( + encoder_logits.view(-1, self.config.num_labels), encoder_labels.view(-1)) + # regression loss + else: + encoder_logits = encoder_logits.view( + encoder_logits.size(0), -1) + encoder_logits = torch.sigmoid( + encoder_logits) * self.num_labels - 0.5 + loss_fct = nn.MSELoss(reduction='none') + _loss = loss_fct(encoder_logits, encoder_labels) + encoder_loss = torch.mean(_loss[encoder_labels >= 0]) + # encoder_loss =_loss[encoder_labels>=0] + + # logits and loss for the decoder + lm_logits = F.linear( + outputs[0], self.model.shared.weight, bias=self.final_logits_bias) + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + # TODO(SS): do we need to ignore pad tokens in labels? 
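+            # Note: nn.CrossEntropyLoss uses ignore_index=-100 by default, so label positions set to -100
+            # (e.g. padding) are already excluded from this decoder loss.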
+ masked_lm_loss = loss_fct( + lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + loss = None + if masked_lm_loss is not None and encoder_loss is not None: + loss = encoder_loss * self.loss_weight + masked_lm_loss + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CBartLMOutput( + loss=loss, + encoder_loss=encoder_loss, + decoder_loss=masked_lm_loss, + encoder_logits=encoder_logits, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation(self, decoder_input_ids, past, attention_mask, use_cache, **kwargs): + assert past is not None, "past has to be defined for encoder_outputs" + + encoder_outputs, past_key_values = past + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + # change this to avoid caching (presumably for debugging) + "use_cache": use_cache, + } + + def adjust_logits_during_generation(self, logits, cur_len, max_length): + if cur_len == 1: + self._force_token_ids_generation(logits, self.config.bos_token_id) + if cur_len == max_length - 1 and self.config.eos_token_id is not None: + self._force_token_ids_generation(logits, self.config.eos_token_id) + return logits + + def _force_token_ids_generation(self, scores, token_ids) -> None: + """force one of token_ids to be generated by setting prob of all other tokens to 0""" + if isinstance(token_ids, int): + token_ids = [token_ids] + all_but_token_ids_mask = torch.tensor( + [x for x in range(self.config.vocab_size) if x not in token_ids], + dtype=torch.long, + device=next(self.parameters()).device, + ) + assert len( + scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" + scores[:, all_but_token_ids_mask] = -float("inf") + + @staticmethod + def _reorder_cache(past, beam_idx): + ((enc_out, enc_mask), past_key_values) = past + reordered_past = [] + for layer_past in past_key_values: + # get the correct batch idx from decoder layer's batch dim for cross and self-attn + layer_past_new = { + attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() + } + reordered_past.append(layer_past_new) + + new_enc_out = enc_out if enc_out is None else enc_out.index_select( + 0, beam_idx) + new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select( + 0, beam_idx) + + past = ((new_enc_out, new_enc_mask), reordered_past) + return past + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return _make_linear_from_emb(self.model.shared) # make it on the fly + + def get_encoder_logits(self, input_ids, attention_mask=None): + # print(input_ids, attention_mask) + # encoder_outputs = self.model.get_encoder_outputs( + # self, + # input_ids, + # attention_mask=attention_mask, + # output_attentions=None, + # output_hidden_states=None, + # return_dict=None, + # ) + + encoder_outputs = self.model.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=True + ) + # logits and loss for the encoder + # last hidden state + 
encoder_last_hidden_state = encoder_outputs['last_hidden_state'] + encoder_logits = self.classification_head(encoder_last_hidden_state) + + # classification + if self.encoder_loss_type == 0: + # probs = torch.softmax(encoder_logits,dim=-1) + pass + # regression + else: + encoder_logits = encoder_logits.view(encoder_logits.size(0), -1) + encoder_logits = torch.sigmoid( + encoder_logits) * self.num_labels - 0.5 + return encoder_outputs, encoder_logits + + +class CBartLightning(LightningModule): + @staticmethod + def add_module_specific_args(parent_args): + parser = parent_args.add_argument_group("CBart specific parameters") + parser.add_argument('--num_labels', type=int, default=3) + parser.add_argument('--encoder_loss_type', type=int, default=0) + parser.add_argument('--loss_weight', type=float, default=1.0) + parser.add_argument('--label_weights', type=float, nargs='+', default=[1.0, 1.0, 1.0]) + parser.add_argument('--masked_lm', type=float, default=0) + return parent_args + + def __init__( + self, + args, + **kwargs, + ): + super().__init__() + self.save_hyperparameters(args) + self.model = BartForTextInfill.from_pretrained(args.model_path, num_labels=self.hparams.num_labels, + encoder_loss_type=self.hparams.encoder_loss_type, + loss_weight=self.hparams.loss_weight,) + self.model.label_weights = torch.tensor( + self.hparams.label_weights, dtype=torch.half) + + def forward(self, **inputs): + return self.model(**inputs) + + def training_step(self, batch, batch_idx): + outputs = self(**batch) + return outputs + + def validation_step(self, batch, batch_idx, dataloader_idx=0): + outputs = self(**batch) + val_loss = outputs["loss"] + + return {"loss": val_loss} + + def setup(self, stage=None) -> None: + if stage != "fit": + return + # Get dataloader by calling it - train_dataloader() is called after setup() by default + train_loader = self.trainer._data_connector._train_dataloader_source.dataloader() + + # Calculate total steps + tb_size = self.hparams.train_batchsize * max(1, self.trainer.gpus) + ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs) + self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size + + def configure_optimizers(self): + transformer_utils.configure_optimizers(self) diff --git a/fengshen/models/clip/__init__.py b/fengshen/models/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8fcc95802f0a32cf3417a68b64c6e37a83813787 --- /dev/null +++ b/fengshen/models/clip/__init__.py @@ -0,0 +1,4 @@ +from .modeling_taiyi_clip import TaiyiCLIPModel +from .processing_taiyi_clip import TaiyiCLIPProcessor + +__all__ = ['TaiyiCLIPModel', 'TaiyiCLIPProcessor'] diff --git a/fengshen/models/clip/configuration_taiyi_clip.py b/fengshen/models/clip/configuration_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..46e1645bce1cf72d007dd21868a8fffe44fc41d7 --- /dev/null +++ b/fengshen/models/clip/configuration_taiyi_clip.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIP model configuration""" + +# from transformers import MegatronBertConfig as BertConfig +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.clip.configuration_clip import CLIPVisionConfig +import copy +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional + + +if TYPE_CHECKING: + from transformers.processing_utils import ProcessorMixin + from transformers.utils import TensorType + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class TaiyiCLIPConfig(PretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co./openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import CLIPConfig, CLIPModel + + >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPConfig() + + >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig + + >>> # Initializing a CLIPText and CLIPVision configuration + >>> config_text = CLIPTextConfig() + >>> config_vision = CLIPVisionConfig() + + >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clip" + is_composition = True + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + super().__init__(**kwargs) + + # If `_config_dict` exist, we use them for the backward compatibility. + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the CLIPTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. 
initializing the CLIPVisionConfig with default values.") + + self.text_config = BertConfig(**text_config) + self.vision_config = CLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: BertConfig, vision_config: CLIPVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + +class CLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ] + ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: "ProcessorMixin", + batch_size: int = -1, + seq_length: int = -1, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.feature_extractor, batch_size=batch_size, framework=framework + ) + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 diff --git a/fengshen/models/clip/modeling_taiyi_clip.py b/fengshen/models/clip/modeling_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e759f41caeb9e1dbc7395a372280e1a4b9cdee1d --- /dev/null +++ b/fengshen/models/clip/modeling_taiyi_clip.py @@ -0,0 +1,253 @@ +import torch +from torch import nn +from transformers.models.clip.modeling_clip import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + CLIP_START_DOCSTRING, + CLIP_TEXT_INPUTS_DOCSTRING, + CLIP_VISION_INPUTS_DOCSTRING, + CLIP_INPUTS_DOCSTRING, + replace_return_docstrings, + CLIPVisionConfig, + CLIPPreTrainedModel, + CLIPVisionTransformer, + CLIPOutput, + CLIPConfig, + clip_loss, +) +from typing import Optional, Tuple, Union +# from transformers import MegatronBertConfig as BertConfig +# from transformers import MegatronBertModel as BertModel +from transformers.models.bert.modeling_bert import BertModel +from transformers.models.bert.configuration_bert import BertConfig +from .configuration_taiyi_clip import TaiyiCLIPConfig + + +@add_start_docstrings(CLIP_START_DOCSTRING) +class TaiyiCLIPModel(CLIPPreTrainedModel): + config_class = 
TaiyiCLIPConfig
+
+    def __init__(self, config: TaiyiCLIPConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, BertConfig):
+            raise ValueError(
+                "config.text_config is expected to be of type BertConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise ValueError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = BertModel(text_config)
+        self.vision_model = CLIPVisionTransformer(vision_config)
+
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import CLIPTokenizer, CLIPModel
+
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
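+        # The text tower here is a BERT encoder rather than CLIP's text transformer, so the sentence embedding
+        # below is taken from the [CLS] token's hidden state instead of CLIP's end-of-text pooling.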
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # pooled_output = text_outputs[1] + pooled_output = text_outputs[0][:, 0, :] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CLIPOutput, config_class=CLIPConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, + image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/fengshen/models/clip/processing_taiyi_clip.py b/fengshen/models/clip/processing_taiyi_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..25350551ee0d5c543cad8a7d759542459cf32cf9 --- /dev/null +++ b/fengshen/models/clip/processing_taiyi_clip.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for Taiyi-CLIP +""" +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import BatchEncoding + + +class TaiyiCLIPProcessor(ProcessorMixin): + r""" + Constructs a Taiyi-CLIP processor which wraps a Taiyi-CLIP feature extractor and a Taiyi-CLIP tokenizer into + a single processor. + + [`TaiyiCLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and + [`BertTokenizerFast`]. See the [`~TaiyiCLIPProcessor.__call__`] and [`~TaiyiCLIPProcessor.decode`] for more + information. + + Args: + feature_extractor ([`CLIPFeatureExtractor`]): + The feature extractor is a required input. + tokenizer ([`BertTokenizerFast`]): + The tokenizer is a required input. 
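+
+    Example (a minimal sketch; the checkpoint names below are generic placeholders rather than the released
+    Taiyi weights):
+
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import BertTokenizerFast, CLIPFeatureExtractor
+
+    >>> tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
+    >>> feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")
+    >>> processor = TaiyiCLIPProcessor(feature_extractor, tokenizer)
+
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> inputs = processor(text=["一张猫的照片"], images=image, return_tensors="pt", padding=True)
+    ```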
+ """ + feature_extractor_class = "CLIPFeatureExtractor" + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPFeatureExtractor's [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the + doctsring of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.feature_extractor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + feature_extractor_input_names = self.feature_extractor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) diff --git a/fengshen/models/deberta_v2/modeling_deberta_v2.py b/fengshen/models/deberta_v2/modeling_deberta_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..d7437a1160cabb7f1446ee3c62bc6fa5a02a59ba --- /dev/null +++ b/fengshen/models/deberta_v2/modeling_deberta_v2.py @@ -0,0 +1,1617 @@ +# coding=utf-8 +# Copyright 2020 Microsoft and the Hugging Face Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch DeBERTa-v2 model.""" + +import math +from collections.abc import Sequence +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import softmax_backward_data +from transformers.utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from transformers import DebertaV2Config + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "DebertaV2Config" +_TOKENIZER_FOR_DOC = "DebertaV2Tokenizer" +_CHECKPOINT_FOR_DOC = "microsoft/deberta-v2-xlarge" + +DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/deberta-v2-xlarge", + "microsoft/deberta-v2-xxlarge", + "microsoft/deberta-v2-xlarge-mnli", + "microsoft/deberta-v2-xxlarge-mnli", +] + + +# Copied from transformers.models.deberta.modeling_deberta.ContextPooler +class ContextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 +class XSoftmax(torch.autograd.Function): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (`torch.tensor`): The input tensor that will apply softmax. 
+ mask (`torch.IntTensor`): + The mask matrix where 0 indicate that element will be ignored in the softmax calculation. + dim (int): The dimension that will apply softmax + + Example: + + ```python + >>> import torch + >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax + + >>> # Make a tensor + >>> x = torch.randn([4, 20, 100]) + + >>> # Create a mask + >>> mask = (x > 0).int() + + >>> # Specify the dimension to apply softmax + >>> dim = -1 + + >>> y = XSoftmax.apply(x, mask, dim) + ```""" + + @staticmethod + def forward(self, input, mask, dim): + self.dim = dim + rmask = ~(mask.to(torch.bool)) + + output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min)) + output = torch.softmax(output, self.dim) + output.masked_fill_(rmask, 0) + self.save_for_backward(output) + return output + + @staticmethod + def backward(self, grad_output): + (output,) = self.saved_tensors + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output) + return inputGrad, None, None + + @staticmethod + def symbolic(g, self, mask, dim): + import torch.onnx.symbolic_helper as sym_help + from torch.onnx.symbolic_opset9 import masked_fill, softmax + + mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"]) + r_mask = g.op( + "Cast", + g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), + to_i=sym_help.cast_pytorch_to_onnx["Byte"], + ) + output = masked_fill(g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.dtype).min))) + output = softmax(g, output, dim) + return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.uint8))) + + +# Copied from transformers.models.deberta.modeling_deberta.DropoutContext +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +# Copied from transformers.models.deberta.modeling_deberta.get_mask +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +# Copied from transformers.models.deberta.modeling_deberta.XDropout +class XDropout(torch.autograd.Function): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask,) = ctx.saved_tensors + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + +# Copied from transformers.models.deberta.modeling_deberta.StableDropout +class StableDropout(nn.Module): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + 
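+    # The dropout mask is applied through XDropout (masked_fill followed by rescaling with 1/(1 - p));
+    # init_context()/get_context() let the same mask be reused across repeated calls when reuse_mask is set.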
+ def forward(self, x): + """ + Call the module + + Args: + x (`torch.tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm +class DebertaV2SelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 +class DebertaV2Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + self_output, att_matrix = self_output + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 +class DebertaV2Intermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm +class DebertaV2Output(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = 
self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 +class DebertaV2Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class ConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, "conv_kernel_size", 3) + groups = getattr(config, "conv_groups", 1) + self.conv_act = getattr(config, "conv_act", "tanh") + self.conv = nn.Conv1d( + config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups + ) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() + rmask = (1 - input_mask).bool() + out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) + out = ACT2FN[self.conv_act](self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input).to(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.dim() != layer_norm_input.dim(): + if input_mask.dim() == 4: + input_mask = input_mask.squeeze(1).squeeze(1) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.to(output.dtype) + output_states = output * input_mask + + return output_states + + +class DebertaV2Encoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, "position_buckets", -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) + + self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] + + if "layer_norm" in self.norm_rel_ebd: + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True) + + self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = 
self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + attention_mask = attention_mask.byte() + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.size(-2) if query_states is not None else hidden_states.size(-2) + relative_pos = build_relative_position( + q, hidden_states.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions + ) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=True, + ): + if attention_mask.dim() <= 2: + input_mask = attention_mask + else: + input_mask = (attention_mask.sum(-2) > 0).byte() + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + output_states = next_kv + for i, layer_module in enumerate(self.layer): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + output_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + ) + else: + output_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + output_states, att_m = output_states + + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, input_mask) + + if query_states is not None: + query_states = output_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = output_states + + if output_attentions: + all_attentions = all_attentions + (att_m,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + if not return_dict: + return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +def make_log_bucket_position(relative_pos, bucket_size, max_position): + sign = np.sign(relative_pos) + mid = bucket_size // 2 + abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos)) + log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid + 
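+    # Offsets within half the bucket size keep their exact relative position; larger offsets are mapped to
+    # signed, log-spaced buckets.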
bucket_pos = np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(np.int64)
+    return bucket_pos
+
+
+def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
+    """
+    Build relative position according to the query and key
+
+    We assume the absolute position of query \\(P_q\\) ranges from (0, query_size) and the absolute position of key
+    \\(P_k\\) ranges from (0, key_size). The relative position from query to key is \\(R_{q \\rightarrow k} = P_q -
+    P_k\\)
+
+    Args:
+        query_size (int): the length of query
+        key_size (int): the length of key
+        bucket_size (int): the size of position bucket
+        max_position (int): the maximum allowed absolute position
+
+    Return:
+        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]
+
+    """
+    q_ids = np.arange(0, query_size)
+    k_ids = np.arange(0, key_size)
+    rel_pos_ids = q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0], 1))
+    if bucket_size > 0 and max_position > 0:
+        rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
+    rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long)
+    rel_pos_ids = rel_pos_ids[:query_size, :]
+    rel_pos_ids = rel_pos_ids.unsqueeze(0)
+    return rel_pos_ids
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.c2p_dynamic_expand
+def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
+    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.p2c_dynamic_expand
+def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
+    return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
+
+
+@torch.jit.script
+# Copied from transformers.models.deberta.modeling_deberta.pos_dynamic_expand
+def pos_dynamic_expand(pos_index, p2c_att, key_layer):
+    return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
+
+
+class DisentangledSelfAttention(nn.Module):
+    """
+    Disentangled self-attention module
+
+    Parameters:
+        config (`DebertaV2Config`):
+            A model config class instance with the configuration to build a new model.
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaV2Config`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + _attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + + self.share_att_key = getattr(config, "share_att_key", False) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.position_buckets = getattr(config, "position_buckets", -1) + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets > 0: + self.pos_ebd_size = self.position_buckets + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if not self.share_att_key: + if "c2p" in self.pos_att_type: + self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True) + if "p2c" in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.size()[:-1] + (attention_heads, -1) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + """ + Call the module + + Args: + hidden_states (`torch.FloatTensor`): + Input states to the module usually the output from previous layer, it will be the Q,K and V in + *Attention(Q,K,V)* + + attention_mask (`torch.ByteTensor`): + An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum + sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j* + th token. + + output_attentions (`bool`, optional): + Whether return the attention matrix. + + query_states (`torch.FloatTensor`, optional): + The *Q* state in *Attention(Q,K,V)*. + + relative_pos (`torch.LongTensor`): + The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with + values ranging in [*-max_relative_positions*, *max_relative_positions*]. + + rel_embeddings (`torch.FloatTensor`): + The embedding of relative distances. It's a tensor of shape [\\(2 \\times + \\text{max_relative_positions}\\), *hidden_size*]. 
+ + + """ + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads) + key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads) + value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. + scale_factor = 1 + if "c2p" in self.pos_att_type: + scale_factor += 1 + if "p2c" in self.pos_att_type: + scale_factor += 1 + scale = math.sqrt(query_layer.size(-1) * scale_factor) + attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2)) / scale + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias( + query_layer, key_layer, relative_pos, rel_embeddings, scale_factor + ) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + attention_scores = attention_scores + attention_scores = attention_scores.view( + -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1) + ) + + # bsz x height x length x dimension + attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + context_layer = torch.bmm( + attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer + ) + context_layer = ( + context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)) + .permute(0, 2, 1, 3) + .contiguous() + ) + new_context_layer_shape = context_layer.size()[:-2] + (-1,) + context_layer = context_layer.view(new_context_layer_shape) + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.size(-2) + relative_pos = build_relative_position( + q, key_layer.size(-2), bucket_size=self.position_buckets, max_position=self.max_relative_positions + ) + if relative_pos.dim() == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.dim() == 3: + relative_pos = relative_pos.unsqueeze(1) + # bsz x height x query x key + elif relative_pos.dim() != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}") + + att_span = self.pos_ebd_size + relative_pos = relative_pos.long().to(query_layer.device) + + rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores( + self.query_proj(rel_embeddings), self.num_attention_heads + ).repeat(query_layer.size(0) // self.num_attention_heads, 1, 1) + pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) + else: + if "c2p" in self.pos_att_type: + pos_key_layer = self.transpose_for_scores( + self.pos_key_proj(rel_embeddings), self.num_attention_heads + ).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) # .split(self.all_head_size, dim=-1) + if "p2c" in self.pos_att_type: + pos_query_layer = self.transpose_for_scores( + self.pos_query_proj(rel_embeddings), self.num_attention_heads + ).repeat( + query_layer.size(0) // self.num_attention_heads, 1, 1 + ) # .split(self.all_head_size, dim=-1) + + score = 0 + # content->position + if "c2p" in self.pos_att_type: + scale = math.sqrt(pos_key_layer.size(-1) * scale_factor) + c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) + c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch.gather( + c2p_att, + dim=-1, + index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]), + ) + score += c2p_att / scale + + # position->content + if "p2c" in self.pos_att_type: + scale = math.sqrt(pos_query_layer.size(-1) * scale_factor) + if key_layer.size(-2) != query_layer.size(-2): + r_pos = build_relative_position( + key_layer.size(-2), + key_layer.size(-2), + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ).to(query_layer.device) + r_pos = r_pos.unsqueeze(0) + else: + r_pos = relative_pos + + p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) + p2c_att = torch.gather( + p2c_att, + dim=-1, + index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)]), + ).transpose(-1, -2) + score += p2c_att / scale + + return score + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm +class DebertaV2Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + pad_token_id = getattr(config, "pad_token_id", 0) + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + self.position_biased_input = getattr(config, "position_biased_input", True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + 
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.config.type_vocab_size > 0: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + # if mask is not None: + # if mask.dim() != embeddings.dim(): + # if mask.dim() == 4: + # mask = mask.squeeze(1).squeeze(1) + # mask = mask.unsqueeze(2) + # mask = mask.to(embeddings.dtype) + + # embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 +class DebertaV2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaV2Config + base_model_prefix = "deberta" + _keys_to_ignore_on_load_missing = ["position_ids"] + _keys_to_ignore_on_load_unexpected = ["position_embeddings"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DebertaV2Encoder): + module.gradient_checkpointing = value + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled + Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build + on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior.``` + + + Parameters: + config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.", + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2 +class DebertaV2Model(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = DebertaV2Embeddings(config) + self.encoder = DebertaV2Encoder(config) + self.z_steps = 0 + self.config = config + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError("The prune function is not implemented in DeBERTa model.") + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + encoded_layers = encoder_outputs[1] + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2 +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = 
[r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta +class DebertaV2PredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta +class DebertaV2LMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an 
output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta +class DebertaV2OnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2 +class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, "num_labels", 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, num_labels) + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
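+
+        A minimal illustration of the two label conventions described above (hypothetical tensors; no checkpoint or
+        tokenizer is assumed):
+
+        ```python
+        >>> import torch
+        >>> # classification (config.num_labels > 1): one class index per example, shape (batch_size,)
+        >>> labels_cls = torch.tensor([0, 2, 1])
+        >>> # regression (config.num_labels == 1): one float target per example, shape (batch_size,)
+        >>> labels_reg = torch.tensor([0.7, 1.3, 0.2])
+        ```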
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, label_index.expand(label_index.size(0), logits.size(1)) + ) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) + else: + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 +class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + DEBERTA_START_DOCSTRING, +) +# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2 +class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.deberta = DebertaV2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
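+
+        A small illustration of the expected span-label format (hypothetical values; no checkpoint is assumed):
+
+        ```python
+        >>> import torch
+        >>> # one answer span per example: token indices of the first and last answer tokens, each of
+        >>> # shape (batch_size,); positions outside the sequence are clamped and ignored by the loss
+        >>> start_positions = torch.tensor([14, 3])
+        >>> end_positions = torch.tensor([17, 6])
+        ```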
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + DEBERTA_START_DOCSTRING, +) +class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + num_labels = getattr(config, "num_labels", 2) + self.num_labels = num_labels + + self.deberta = DebertaV2Model(config) + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim + + self.classifier = nn.Linear(output_dim, 1) + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = StableDropout(drop_out) + + self.init_weights() + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.deberta.set_input_embeddings(new_embeddings) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.deberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + encoder_layer = outputs[0] + pooled_output = self.pooler(encoder_layer) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fengshen/models/deepVAE/__init__.py b/fengshen/models/deepVAE/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bcf019eaf0b04fd1c23d0d51d3ea0f1b62d1c306 --- /dev/null +++ b/fengshen/models/deepVAE/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. """ diff --git a/fengshen/models/deepVAE/configuration_della.py b/fengshen/models/deepVAE/configuration_della.py new file mode 100644 index 0000000000000000000000000000000000000000..332e6d71863c3f7266477ea3691a8226b602df01 --- /dev/null +++ b/fengshen/models/deepVAE/configuration_della.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Della model configuration """
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+Della_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "Della-226M-base": "https://huggingface.co./IDEA-CCNL/Randeng-DELLA-226M-Chinese/resolve/main/config.json"
+}
+
+
+class DellaModelConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~DellaModel`]. It is used to instantiate a
+    DellaModel according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a configuration similar to that of
+    [Randeng-DELLA-226M-Chinese](https://huggingface.co./IDEA-CCNL/Randeng-DELLA-226M-Chinese).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the Della model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~DellaModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer decoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer.
+        n_inner (`int`, *optional*, defaults to `None`):
+            Dimensionality of the inner feed-forward layers. `None` sets it to 4 times `n_embd`.
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings and residual branches.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Whether to scale attention weights by dividing by `sqrt(hidden_size)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/value attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast the attention
+            dot-product/softmax to float when training with mixed precision.
+        bos_token_id (`int`, *optional*, defaults to 21128):
+            Id of the beginning-of-sequence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 21129):
+            Id of the end-of-sequence token in the vocabulary.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Id of the padding token in the vocabulary.
+        CVAE (`bool`, *optional*, defaults to `False`):
+            Whether to configure the model as a conditional VAE.
+        latent_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the latent variables used by the VAE.
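+
+    Example (an illustrative sketch; it only assumes the class is importable from this module, as it is in
+    `deep_vae.py`):
+
+    ```python
+    >>> from fengshen.models.deepVAE.configuration_della import DellaModelConfig
+
+    >>> # Initializing a configuration with the defaults (similar to Randeng-DELLA-226M-Chinese)
+    >>> configuration = DellaModelConfig()
+    >>> configuration.n_layer
+    12
+    ```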
+ +""" + model_type = "DellaModel" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + scale_attn_weights=True, + use_cache=True, + scale_attn_by_inverse_layer_idx=False, + reorder_and_upcast_attn=False, + bos_token_id=21128, + eos_token_id=21129, + pad_token_id=0, + CVAE=False, + latent_dim=256, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx + self.reorder_and_upcast_attn = reorder_and_upcast_attn + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.CVAE = CVAE + self.latent_dim = latent_dim + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) diff --git a/fengshen/models/deepVAE/deep_vae.py b/fengshen/models/deepVAE/deep_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..08f03849469375d6f45eb26321b257b674250e77 --- /dev/null +++ b/fengshen/models/deepVAE/deep_vae.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright 2022 IDEA-CCNL The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Della model. 
""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import dataclass +from typing import Optional, Tuple +from transformers.modeling_outputs import ModelOutput +from transformers.modeling_utils import PreTrainedModel +from fengshen.models.deepVAE.configuration_della import DellaModelConfig +from fengshen.models.deepVAE.latent_connector import GPT2ForDecoderLatentConnector, GPT2ForEncoderLatentConnector +from fengshen.models.deepVAE.utils import connect, compute_kl_loss, top_k_top_p_filtering, enforce_repetition_penalty + + +_CHECKPOINT_FOR_DOC = "della-226M-base" +_CONFIG_FOR_DOC = "DellaModelConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" +Della_model_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "della-226M-base" +] + + +@dataclass +class DellaModelOutput(ModelOutput): + logits: torch.FloatTensor = None + posterior_latents: Optional[Tuple[torch.FloatTensor]] = None + prior_latent: Optional[Tuple[torch.FloatTensor]] = None + + +class latent_layer(nn.Module): + def __init__(self, input_dim) -> None: + super().__init__() + self.W_hh = nn.Linear(input_dim, input_dim, bias=False) + self.W_ih = nn.Linear(input_dim, input_dim, bias=False) + self.tanh = nn.Tanh() + + def forward(self, z_lt_lm1, z_lm1): + # inputs are z_